bitkeeper revision 1.1159.1.483 (41c0c417XYObowWqbfqU0cdLx30C9w)
author iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Wed, 15 Dec 2004 23:09:11 +0000 (23:09 +0000)
committer iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Wed, 15 Dec 2004 23:09:11 +0000 (23:09 +0000)
Initial Intel VMX changes to support unmodified Linux guests on Intel's VT platform.

29 files changed:
.rootkeys
docs/misc/VMX_changes.txt [new file with mode: 0644]
xen/arch/x86/domain.c
xen/arch/x86/io_apic.c
xen/arch/x86/setup.c
xen/arch/x86/shadow.c
xen/arch/x86/time.c
xen/arch/x86/vmx.c [new file with mode: 0644]
xen/arch/x86/vmx_io.c [new file with mode: 0644]
xen/arch/x86/vmx_vmcs.c [new file with mode: 0644]
xen/arch/x86/x86_32/entry.S
xen/common/event_channel.c
xen/common/kernel.c
xen/common/softirq.c
xen/include/asm-x86/config.h
xen/include/asm-x86/cpufeature.h
xen/include/asm-x86/e820.h [new file with mode: 0644]
xen/include/asm-x86/mm.h
xen/include/asm-x86/msr.h
xen/include/asm-x86/processor.h
xen/include/asm-x86/shadow.h
xen/include/asm-x86/vmx.h [new file with mode: 0644]
xen/include/asm-x86/vmx_cpu.h [new file with mode: 0644]
xen/include/asm-x86/vmx_platform.h [new file with mode: 0644]
xen/include/asm-x86/vmx_vmcs.h [new file with mode: 0644]
xen/include/public/arch-x86_32.h
xen/include/public/io/ioreq.h [new file with mode: 0644]
xen/include/xen/sched.h
xen/include/xen/types.h

index c15c15cda5971bc818208da8fbb00546f8109387..c6ce0aa40a8bceddf45e240d534f19e748a8ce3b 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
@@ -15,6 +15,7 @@
 4187c1c7IWmBinGdI19kL4MuZ6RLbQ docs/check_pkgs
 3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/figs/xenlogo.eps
 418a3248xjIqmNKo0v_XQSfAvlBGFw docs/html.sty
+41c0c4116itF389v0CEWcmzue6zJkA docs/misc/VMX_changes.txt
 4022a73cgxX1ryj1HgS-IwwB6NUi2A docs/misc/XenDebugger-HOWTO
 412f4bd9sm5mCQ8BkrgKcAKZGadq7Q docs/misc/blkif-drivers-explained.txt
 40d6ccbfKKBq8jE0ula4eHEzBiQuDA docs/misc/xen_config.html
 3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c
 3ddb79bccYVzXZJyVaxuv5T42Z1Fsw xen/arch/x86/trampoline.S
 3ddb79bcOftONV9h4QCxXOfiT0h91w xen/arch/x86/traps.c
+41c0c411tD3C7TpfDMiFTf7BaNd_Dg xen/arch/x86/vmx.c
+41c0c411ODt8uEmV-yUxpQLpqimE5Q xen/arch/x86/vmx_io.c
+41c0c4128URE0dxcO15JME_MuKBPfg xen/arch/x86/vmx_vmcs.c
 419cbedeQDg8IrO3izo3o5rQNlo0kQ xen/arch/x86/x86_32/asm-offsets.c
 3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/x86/x86_32/domain_page.c
 3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/x86/x86_32/entry.S
 3ddb79c34BFiXjBJ_cCKB0aCsV1IDw xen/include/asm-x86/desc.h
 40715b2dTokMLYGSuD58BnxOqyWVew xen/include/asm-x86/div64.h
 3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-x86/domain_page.h
+41c0c412Ufq5sAvri3dMHC1BXiO6Gw xen/include/asm-x86/e820.h
 3ddb79c3NU8Zy40OTrq3D-i30Y3t4A xen/include/asm-x86/fixmap.h
 3e2d29944GI24gf7vOP_7x8EyuqxeA xen/include/asm-x86/flushtlb.h
 3ddb79c39o75zPP0T1aQQ4mNrCAN2w xen/include/asm-x86/hardirq.h
 3ddb79c3ezddh34MdelJpa5tNR00Dw xen/include/asm-x86/system.h
 3ddb79c4HugMq7IYGxcQKFBpKwKhzA xen/include/asm-x86/types.h
 40cf1596saFaHD5DC5zvrSn7CDCWGQ xen/include/asm-x86/uaccess.h
+41c0c412k6GHYF3cJtDdw37ee3TVaw xen/include/asm-x86/vmx.h
+41c0c412hck3QX-6_MaXaISGkngQuA xen/include/asm-x86/vmx_cpu.h
+41c0c41243jC1mcArZx_t3YkBL4lTA xen/include/asm-x86/vmx_platform.h
+41c0c412lQ0NVVN9PsOSznQ-qhOiPA xen/include/asm-x86/vmx_vmcs.h
 418fbcfe_WliJPToeVM-9VStvym-hw xen/include/asm-x86/x86_32/asm_defns.h
 3ddb79c2ADvRmdexd9y3AYK9_NTx-Q xen/include/asm-x86/x86_32/current.h
 3ddb79c3mbqEM7QQr3zVq7NiBNhouA xen/include/asm-x86/x86_32/regs.h
 4121d149udGfSUGhn3k1ECz0bM31nQ xen/include/public/grant_table.h
 40f5623bqoi4GEoBiiUc6TZk1HjsMg xen/include/public/io/blkif.h
 40dc4076pVeE1kEEWzcUaNZin65kCA xen/include/public/io/domain_controller.h
+41c0c412FLc0gunlJl91qMYscFtXVA xen/include/public/io/ioreq.h
 40f5623cTZ80EwjWUBlh44A9F9i_Lg xen/include/public/io/netif.h
 4051db79512nOCGweabrFWO2M2h5ng xen/include/public/physdev.h
 40589968wmhPmV5-ENbBYmMjnedgKw xen/include/public/sched_ctl.h
diff --git a/docs/misc/VMX_changes.txt b/docs/misc/VMX_changes.txt
new file mode 100644 (file)
index 0000000..739d315
--- /dev/null
@@ -0,0 +1,90 @@
+Changes to Xen in support of Intel(R) Vanderpool Technology
+-------------------------------------------------------------
+
+Our VT extensions to the Xen hypervisor provide full platform
+virtualization, including CPU(s), memory, and I/O infrastructure. The
+generic code in Xen handles and schedules those virtual machines as it
+does for the existing para-virtualized domains.
+
+Full virtualization of unmodified guest OSes requires full device
+virtualization as well. The device models in BOCHS
+(http://bochs.sourceforge.net/) were decoupled from the CPU
+virtualization, and are used to virtualize the legacy devices (such as
+keyboard, mouse, VGA, IDE) in the PC platform. At this point, the
+device models run in user mode on domain 0, not in the Xen hypervisor.
+
+We would like to thank Ian Pratt and Keir Fraser for reviewing our
+design and code intensively, and for providing numerous useful
+suggestions to improve the architecture and code. 
+
+We have a list of Intel team members who take credit for making this
+release happen: Yunhong Jiang, Nitin Kamble, Chengyuan Li, Xin Li,
+Xiaofeng Ling, Benjamin Liu, Asit Mallick, Jun Nakajima, Sunil Saxena,
+Arun Sharma, Edwin Zhai, Jeff Zheng, and Louis Zhuang. We'll continue
+to add more features to complete full virtualization in Xen using VT.
+
+The notes document the changes to the Xen hypervisor in order to add
+VT support. The changes to other areas, such as Control Panel will be
+added as we deliver the code.
+
+Summary of changes for the first release
+----------------------------------------
+December 15, 2004
+
+    * VT specific event handling and domain management were added. 
+
+    * Shadow mode was extended to support full 32-bit guests
+    
+    * Domain switching code was extended to support VT domain
+    
+    * I/O request handling was added to communicate with the device model
+
+    * Domain builder was extended to provide the environment when the
+      guest enters the protected mode, including E820 memory and VGA
+      info, typically obtained by BIOS calls.
+
+New code:
+---------
+    VT (Vanderpool Technology) is based on the new VMX (Virtual
+    Machine Extensions) architecture. The current release of the
+    software supports 32-bit only.
+
+    * arch/x86/vmx.[ch] and arch/x86/vmx_*.[ch]: created to handle
+      VMX-specific events in order to provide a virtual machine.
+
+    * arch/x86/x86_32/entry.S: new code path was added to have the
+      first-level handler from VM exits. The first-level handler calls
+      the second-level handler in arch/x86/vmx.c.
+
+    * arch/x86/setup.c: new function start_vmx() to init_intel() to
+      enable VMX mode.
+
+    * include/asm-x86/config.h: #ifdef CONFIG_VMX was added.
+
+    * arch/x86/domain.c: new code path was added to create a VMX
+      domain given the flag from the control panel.
+
+    * include/public/io/ioreq.h: A new data structure was added to
+      define the I/O requests between the Xen hypervisor and the
+      device models.
+
+Changes to the existing code:
+-----------------------------
+
+    * arch/x86/shadow.[ch]: new mode SHM_full_32 was added to support
+      full virtualization. The current Xen code assumes that the guest
+      page directory and tables have _machine_ (or host) physical page
+      frame numbers, and the new code allows to support _guest_
+      physical page frame numbers
+
+    * include/asm-x86/processor.h: struct arch_vmx_struct arch_vmx has
+      been added to the thread_struct data structure. The arch_vmx has
+      the additional VMX-related CPU context.
+
+    * arch/x86/io_apic.c: reverse mapping between vector and irq has
+      been added. We will revisit this code when considering MSI
+      support.
+
+--- Jun
+
+
index 04b3e1269549f1d51f46456c1765501b1c3ffa6b..646bbc3aa0a46b027f9dafee97f6da0c6677efd9 100644 (file)
 #include <asm/shadow.h>
 #include <xen/console.h>
 #include <xen/elf.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <xen/kernel.h>
+#include <public/io/ioreq.h>
 #include <xen/multicall.h>
 
 #if !defined(CONFIG_X86_64BITMODE)
@@ -158,6 +162,9 @@ void machine_restart(char * __unused)
     smp_send_stop();
     disable_IO_APIC();
 #endif
+#ifdef CONFIG_VMX
+    stop_vmx();
+#endif
 
     if(!reboot_thru_bios) {
         /* rebooting needs to touch the page at absolute addr 0 */
@@ -239,6 +246,97 @@ void arch_do_createdomain(struct exec_domain *ed)
     }
 }
 
+#ifdef CONFIG_VMX
+void arch_vmx_do_resume(struct exec_domain *d) 
+{
+    vmx_do_resume(d);
+    reset_stack_and_jump(vmx_asm_do_resume);
+}
+
+void arch_vmx_do_launch(struct exec_domain *d) 
+{
+    vmx_do_launch(d);
+    reset_stack_and_jump(vmx_asm_do_launch);
+}
+
+static void monitor_mk_pagetable(struct exec_domain *ed)
+{
+    unsigned long mpfn;
+    l2_pgentry_t *mpl2e;
+    struct pfn_info *mpfn_info;
+    struct mm_struct *m = &ed->mm;
+    struct domain *d = ed->domain;
+
+    mpfn_info = alloc_domheap_page(NULL);
+    ASSERT( mpfn_info ); 
+
+    mpfn = (unsigned long) (mpfn_info - frame_table);
+    mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT);
+    memset(mpl2e, 0, PAGE_SIZE);
+
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+    m->monitor_table = mk_pagetable(mpfn << PAGE_SHIFT);
+    m->shadow_mode = SHM_full_32;
+
+    mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((__pa(d->mm_perdomain_pt) & PAGE_MASK) 
+                      | __PAGE_HYPERVISOR);
+
+    unmap_domain_mem(mpl2e);
+}
+
+static int vmx_final_setup_guestos(struct exec_domain *d,
+                                   full_execution_context_t *full_context)
+{
+    int error;
+    execution_context_t *context;
+    struct vmcs_struct *vmcs;
+    unsigned long guest_pa;
+
+    context = &full_context->cpu_ctxt;
+
+    /*
+     * Create a new VMCS
+     */
+    if (!(vmcs = alloc_vmcs())) {
+        printk("Failed to create a new VMCS\n");
+        return -ENOMEM;
+    }
+
+    memset(&d->thread.arch_vmx, 0, sizeof (struct arch_vmx_struct));
+
+    d->thread.arch_vmx.vmcs = vmcs;
+    error = construct_vmcs(&d->thread.arch_vmx, context, full_context, VMCS_USE_HOST_ENV);
+    if (error < 0) {
+        printk("Failed to construct a new VMCS\n");
+        goto out;
+    }
+
+    monitor_mk_pagetable(d);
+    guest_pa = pagetable_val(d->mm.pagetable);
+    clear_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state);
+
+    d->thread.arch_vmx.vmx_platform.real_mode_data = 
+        (unsigned long *) context->esi;
+
+    memset(&d->domain->shared_info->evtchn_mask[0], 0xff, 
+           sizeof(d->domain->shared_info->evtchn_mask));
+    clear_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_mask[0]);
+
+    d->thread.schedule_tail = arch_vmx_do_launch;
+
+    return 0;
+
+out:
+    free_vmcs(vmcs);
+    d->thread.arch_vmx.vmcs = 0;
+    return error;
+}
+#endif
+
 int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
 {
     unsigned long phys_basetab;
@@ -310,6 +408,11 @@ int arch_final_setup_guestos(struct exec_domain *d, full_execution_context_t *c)
         }
     }
 
+#ifdef CONFIG_VMX
+    if (c->flags & ECF_VMX_GUEST)
+        return vmx_final_setup_guestos(d, c);
+#endif
+
     return 0;
 }
 
@@ -356,7 +459,8 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
     struct tss_struct *tss = init_tss + smp_processor_id();
     execution_context_t *stack_ec = get_execution_context();
     int i;
-    
+    unsigned long vmx_domain = next_p->thread.arch_vmx.flags; 
+
     __cli();
 
     /* Switch guest general-register state. */
@@ -375,12 +479,6 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
                &next_p->thread.user_ctxt,
                sizeof(*stack_ec));
 
-        SET_FAST_TRAP(&next_p->thread);
-
-        /* Switch the guest OS ring-1 stack. */
-        tss->esp1 = next->guestos_sp;
-        tss->ss1  = next->guestos_ss;
-
         /* Maybe switch the debug registers. */
         if ( unlikely(next->debugreg[7]) )
         {
@@ -393,6 +491,24 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
             loaddebug(next, 7);
         }
 
+         if (vmx_domain) {
+            /* Switch page tables. */
+            write_ptbase(&next_p->mm);
+            set_current(next_p);
+            /* Switch GDT and LDT. */
+            __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->mm.gdt));
+
+            __sti();
+            return;
+         }
+        SET_FAST_TRAP(&next_p->thread);
+
+        /* Switch the guest OS ring-1 stack. */
+        tss->esp1 = next->guestos_sp;
+        tss->ss1  = next->guestos_ss;
+
         /* Switch page tables. */
         write_ptbase(&next_p->mm);
     }
index 5ab74351b8533cbd866a4cbdb1a1aa874a033b93..6d1593daff766802da0a6000aed68a4c66c6319f 100644 (file)
@@ -615,6 +615,10 @@ static inline int IO_APIC_irq_trigger(int irq)
 
 int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
 
+#ifdef CONFIG_VMX
+int vector_irq[256];
+#endif
+
 static int __init assign_irq_vector(int irq)
 {
        static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
@@ -637,6 +641,10 @@ next:
                panic("ran out of interrupt sources!");
 
        IO_APIC_VECTOR(irq) = current_vector;
+#ifdef CONFIG_VMX
+        vector_irq[current_vector] = irq;
+        printk("vector_irq[%x] = %d\n", current_vector, irq);
+#endif
        return current_vector;
 }
 
index cb8cb31a509de0b54a4e8e2585e9730e0134949f..3d0f4ea121ee9eaadd11d0f2b364615c724416bf 100644 (file)
@@ -146,6 +146,11 @@ static void __init init_intel(struct cpuinfo_x86 *c)
         }
     }
 #endif
+
+#ifdef CONFIG_VMX
+    start_vmx();
+#endif
+
 }
 
 static void __init init_amd(struct cpuinfo_x86 *c)
index ec08e653aff8fe43b3c7033edbfd437e622abb4a..24853764728e4534e35bfe29b821522920d06d49 100644 (file)
@@ -120,7 +120,10 @@ static inline int clear_shadow_page(
         /* We clear L2 pages by zeroing the guest entries. */
     case PGT_l2_page_table:
         p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
-        memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
+        if (m->shadow_mode == SHM_full_32)
+            memset(p, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
+        else 
+            memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
         unmap_domain_mem(p);
         break;
 
@@ -433,12 +436,24 @@ void unshadow_table(unsigned long gpfn, unsigned int type)
     free_shadow_page(&d->exec_domain[0]->mm, &frame_table[spfn]);
 }
 
+#ifdef CONFIG_VMX
+void vmx_shadow_clear_state(struct mm_struct *m) 
+{
+    SH_VVLOG("vmx_clear_shadow_state: \n");
+    clear_shadow_state(m);
+}
+#endif
+
+
 unsigned long shadow_l2_table( 
     struct mm_struct *m, unsigned long gpfn)
 {
     struct pfn_info *spfn_info;
     unsigned long    spfn;
-    l2_pgentry_t    *spl2e;
+    l2_pgentry_t    *spl2e = 0, *gpl2e;
+    unsigned long guest_gpfn;
+
+    __get_machine_to_phys(m, guest_gpfn, gpfn);
 
     SH_VVLOG("shadow_l2_table( %08lx )", gpfn);
 
@@ -451,33 +466,41 @@ unsigned long shadow_l2_table(
     perfc_incr(shadow_l2_pages);
 
     spfn = spfn_info - frame_table;
-
-    /* Mark pfn as being shadowed; update field to point at shadow. */
-    set_shadow_status(m, gpfn, spfn | PSH_shadowed);
+  /* Mark pfn as being shadowed; update field to point at shadow. */
+    set_shadow_status(m, guest_gpfn, spfn | PSH_shadowed);
  
-    spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
-
-    /*
-     * We could proactively fill in PDEs for pages that are already shadowed.
-     * However, we tried it and it didn't help performance. This is simpler.
-     */
-    memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-
 #ifdef __i386__
     /* Install hypervisor and 2x linear p.t. mapings. */
-    memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
-           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
-           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-    spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-    spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
-        mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) |
-                      __PAGE_HYPERVISOR);
+    if (m->shadow_mode == SHM_full_32) 
+        vmx_update_shadow_state(m, gpfn, spfn);
+    else {
+        spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
+        // can't use the linear map as we may not be in the right PT
+        gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+        /*
+         * We could proactively fill in PDEs for pages that are already shadowed.
+         * However, we tried it and it didn't help performance. This is simpler.
+         */
+        memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+
+        /* Install hypervisor and 2x linear p.t. mapings. */
+        memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+               &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+               HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+        spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+            mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+        spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+            mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+        spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+            mk_l2_pgentry(__pa(frame_table[gpfn].u.inuse.domain->mm_perdomain_pt) |
+                         __PAGE_HYPERVISOR);
+    }
 #endif
 
-    unmap_domain_mem(spl2e);
+    if (m->shadow_mode != SHM_full_32) 
+    {                           
+        unmap_domain_mem(spl2e);
+    }
 
     SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn);
     return spfn;
@@ -486,13 +509,13 @@ unsigned long shadow_l2_table(
 static void shadow_map_l1_into_current_l2(unsigned long va)
 { 
     struct mm_struct *m = &current->mm;
-    unsigned long    *gpl1e, *spl1e, gpde, spde, gl1pfn, sl1pfn, sl1ss;
+    unsigned long    *gpl1e, *spl1e, gpl2e, spl2e, gl1pfn, sl1pfn=0, sl1ss;
     struct pfn_info  *sl1pfn_info;
     int               i;
 
-    gpde = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+    __guest_get_pl2e(m, va, &gpl2e);
 
-    gl1pfn = gpde >> PAGE_SHIFT;
+    gl1pfn = gpl2e >> PAGE_SHIFT;
 
     sl1ss = __shadow_status(m, gl1pfn);
     if ( !(sl1ss & PSH_shadowed) )
@@ -510,11 +533,10 @@ static void shadow_map_l1_into_current_l2(unsigned long va)
 
         set_shadow_status(m, gl1pfn, PSH_shadowed | sl1pfn);
 
-        l2pde_general(m, &gpde, &spde, sl1pfn);
+        l2pde_general(m, &gpl2e, &spl2e, sl1pfn);
 
-        linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
-        shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] =
-            mk_l2_pgentry(spde);
+        __guest_set_pl2e(m, va, gpl2e);
+        __shadow_set_pl2e(m, va, spl2e);
 
         gpl1e = (unsigned long *) &(linear_pg_table[
             (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]);
@@ -531,13 +553,38 @@ static void shadow_map_l1_into_current_l2(unsigned long va)
         SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn);
 
         sl1pfn = sl1ss & PSH_pfn_mask;
-        l2pde_general(m, &gpde, &spde, sl1pfn);
-
-        linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
-        shadow_linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
+        l2pde_general(m, &gpl2e, &spl2e, sl1pfn);
+        __guest_set_pl2e(m, va, gpl2e);
+        __shadow_set_pl2e(m, va, spl2e);
     }              
 }
 
+#ifdef CONFIG_VMX
+void vmx_shadow_invlpg(struct mm_struct *m, unsigned long va)
+{
+    unsigned long gpte, spte, host_pfn;
+
+    if (__put_user(0L, (unsigned long *)
+                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
+        vmx_shadow_clear_state(m);
+        return;
+    }
+
+    if (__get_user(gpte, (unsigned long *)
+                   &linear_pg_table[va >> PAGE_SHIFT])) {
+        return;
+    }
+
+    host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+    spte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+
+    if (__put_user(spte, (unsigned long *)
+                   &shadow_linear_pg_table[va >> PAGE_SHIFT])) {
+        return;
+    }
+}
+#endif
+
 int shadow_fault(unsigned long va, long error_code)
 {
     unsigned long gpte, spte;
@@ -718,6 +765,9 @@ static int check_pte(
     int level, int i)
 {
     unsigned long mask, gpfn, spfn;
+#ifdef CONFIG_VMX
+    unsigned long guest_gpfn;
+#endif
 
     if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
         return 1;  /* always safe */
@@ -761,8 +811,20 @@ static int check_pte(
         if ( level < 2 )
             FAIL("Shadow in L1 entry?");
 
-        if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
-            FAIL("spfn problem g.sf=%08lx", __shadow_status(m, gpfn));
+        if (m->shadow_mode == SHM_full_32) {
+
+            guest_gpfn = phys_to_machine_mapping[gpfn];
+
+            if ( __shadow_status(m, guest_gpfn) != (PSH_shadowed | spfn) )
+                FAIL("spfn problem g.sf=%08lx", 
+                     __shadow_status(m, guest_gpfn) );
+            
+        } else {
+            if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
+                FAIL("spfn problem g.sf=%08lx", 
+                     __shadow_status(m, gpfn) );
+        }
+
     }
 
     return 1;
@@ -800,6 +862,7 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s)
     unsigned long gpfn, spfn;
     int           i;
     l2_pgentry_t *gpl2e, *spl2e;
+    unsigned long host_gpfn = 0;
 
     sh_check_name = s;
 
@@ -809,20 +872,29 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s)
 
     gpfn = gptbase >> PAGE_SHIFT;
 
-    if ( !(__shadow_status(m, gpfn) & PSH_shadowed) )
+    __get_phys_to_machine(m, host_gpfn, gpfn);
+  
+    if ( ! (__shadow_status(m, gpfn) & PSH_shadowed) )
     {
         printk("%s-PT %08lx not shadowed\n", s, gptbase);
-        if ( __shadow_status(m, gpfn) != 0 )
-            BUG();
-        return 0;
-    }
+
+        if( __shadow_status(m, gpfn) != 0 ) BUG();
+            return 0;
+    }   
  
     spfn = __shadow_status(m, gpfn) & PSH_pfn_mask;
 
-    if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
-        FAILPT("ptbase shadow inconsistent1");
+    if ( ! __shadow_status(m, gpfn) == (PSH_shadowed | spfn) )
+            FAILPT("ptbase shadow inconsistent1");
+
+    if (m->shadow_mode == SHM_full_32) 
+    {
+        host_gpfn = phys_to_machine_mapping[gpfn];
+        gpl2e = (l2_pgentry_t *) map_domain_mem( host_gpfn << PAGE_SHIFT );
+
+    } else
+        gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
 
-    gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
     spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
 
     if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
@@ -830,7 +902,6 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s)
                 ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) )
     {
-        printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
         for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; 
               i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT);
               i++ )
@@ -851,11 +922,12 @@ int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s)
                                    L2_PAGETABLE_SHIFT]),
                (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 
-    if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
-          ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | 
+    if (m->shadow_mode != SHM_full_32) {
+        if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+              ((__pa(frame_table[gpfn].u.inuse.domain->mm.perdomain_pt) | 
             __PAGE_HYPERVISOR))) )
-        FAILPT("hypervisor per-domain map inconsistent");
-
+            FAILPT("hypervisor per-domain map inconsistent");
+    }
 
     /* Check the whole L2. */
     for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
index cc9ef381b934c64e09d82c445b05ade27370776a..3ff5e7f135648ac44c0a2bbe97879ed0b132e4aa 100644 (file)
@@ -50,7 +50,7 @@ static s_time_t        stime_irq;       /* System time at last 'time update' */
 static unsigned long   wc_sec, wc_usec; /* UTC time at last 'time update'.   */
 static rwlock_t        time_lock = RW_LOCK_UNLOCKED;
 
-static void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs)
+void timer_interrupt(int irq, void *dev_id, struct xen_regs *regs)
 {
     write_lock_irq(&time_lock);
 
diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c
new file mode 100644 (file)
index 0000000..f42c596
--- /dev/null
@@ -0,0 +1,913 @@
+/*
+ * vmx.c: handling VMX architecture-related VM exits
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/shadow.h>
+#include <asm/regs.h>
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/types.h>
+#include <asm/msr.h>
+#include <asm/spinlock.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <public/io/ioreq.h>
+
+int vmcs_size;
+unsigned int opt_vmx_debug_level;
+
+int start_vmx()
+{
+    struct vmcs_struct *vmcs;
+    unsigned long ecx;
+    u64 phys_vmcs;      /* debugging */
+
+    vmcs_size = VMCS_SIZE;
+    /*
+     * Xen does not fill x86_capability words except 0.
+     */
+    ecx = cpuid_ecx(1);
+    boot_cpu_data.x86_capability[4] = ecx;
+
+    if (!(test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability)))
+        return 0;
+
+    set_in_cr4(X86_CR4_VMXE);   /* Enable VMXE */
+
+    if (!(vmcs = alloc_vmcs())) {
+        printk("Failed to allocate VMCS\n");    
+        return 0;
+    }
+
+    phys_vmcs = (u64) virt_to_phys(vmcs);
+
+    if (!(__vmxon(phys_vmcs))) {
+        printk("VMXON is done\n");
+    }
+
+    return 1;
+}
+
+void stop_vmx()
+{
+    if (test_bit(X86_FEATURE_VMXE, &boot_cpu_data.x86_capability))
+        __vmxoff();
+}
+
+/*
+ * Not all cases recevie valid value in the VM-exit instruction length field.
+ */
+#define __get_instruction_length(len) \
+    __vmread(INSTRUCTION_LEN, &(len)); \
+     if ((len) < 1 || (len) > 15) \
+        __vmx_bug(&regs);
+
+static void inline __update_guest_eip(unsigned long inst_len) 
+{
+    unsigned long current_eip;
+
+    __vmread(GUEST_EIP, &current_eip);
+    __vmwrite(GUEST_EIP, current_eip + inst_len);
+}
+
+
+#include <asm/domain_page.h>
+
+static int vmx_do_page_fault(unsigned long va, unsigned long error_code) 
+{
+    unsigned long eip, pfn;
+    unsigned int index;
+    unsigned long gpde = 0;
+    int result;
+    struct exec_domain *ed = current;
+    struct mm_struct *m = &ed->mm;
+
+#if VMX_DEBUG
+    {
+        __vmread(GUEST_EIP, &eip);
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, 
+                "vmx_do_page_fault = 0x%lx, eip = %lx, erro_code = %lx\n", 
+                va, eip, error_code);
+    }
+#endif
+    /*
+     * Set up guest page directory cache to make linear_pt_table[] work.
+     */
+    __guest_get_pl2e(m, va, &gpde);
+    if (!(gpde & _PAGE_PRESENT))
+        return 0;
+
+    index = (va >> L2_PAGETABLE_SHIFT);
+    if (!l2_pgentry_val(m->guest_pl2e_cache[index])) {
+        pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];
+
+        VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault: pagetable = %lx\n",
+                pagetable_val(m->pagetable));
+
+        m->guest_pl2e_cache[index] = 
+            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    }
+
+    if ((result = shadow_fault(va, error_code)))
+        return result;
+    
+    return 0;       /* failed to resolve, i.e raise #PG */
+}
+
+static void vmx_do_general_protection_fault(struct xen_regs *regs) 
+{
+    unsigned long eip, error_code;
+
+    __vmread(GUEST_EIP, &eip);
+    __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
+
+    VMX_DBG_LOG(DBG_LEVEL_1, 
+            "vmx_general_protection_fault: eip = %lx, erro_code = %lx\n",
+            eip, error_code);
+
+    VMX_DBG_LOG(DBG_LEVEL_1, 
+            "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n",
+            regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi);
+
+    __vmx_bug(regs);
+}
+
+static void vmx_vmexit_do_cpuid(unsigned long input, struct xen_regs *regs) 
+{
+    int eax, ebx, ecx, edx;
+    unsigned long eip;
+
+    __vmread(GUEST_EIP, &eip);
+
+    VMX_DBG_LOG(DBG_LEVEL_1, 
+            "do_cpuid: (eax) %x, (ebx) %x, (ecx) %x, (edx) %x, (esi) %x, (edi) %x\n", regs->eax, regs->ebx, regs->ecx, regs->edx, regs->esi, regs->edi);
+
+    cpuid(input, &eax, &ebx, &ecx, &edx);
+
+    if (input == 1) {
+        clear_bit(X86_FEATURE_PSE, &edx);
+        clear_bit(X86_FEATURE_PAE, &edx);
+        clear_bit(X86_FEATURE_PSE36, &edx);
+    }
+
+    regs->eax = (unsigned long) eax;
+    regs->ebx = (unsigned long) ebx;
+    regs->ecx = (unsigned long) ecx;
+    regs->edx = (unsigned long) edx;
+
+    VMX_DBG_LOG(DBG_LEVEL_1, 
+            "vmx_vmexit_do_cpuid: eip: %lx, input: %lx, out:eax=%x, ebx=%x, ecx=%x, edx=%x\n", 
+            eip, input, eax, ebx, ecx, edx);
+
+}
+
+#define CASE_GET_REG_P(REG, reg)    \
+    case REG_ ## REG: reg_p = &(regs->reg); break
+
+static void vmx_dr_access (unsigned long exit_qualification, struct xen_regs *regs)
+{
+    unsigned int reg;
+    u32 *reg_p = 0;
+    struct exec_domain *ed = current;
+    u32 eip;
+
+    __vmread(GUEST_EIP, &eip);
+
+    reg = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+    VMX_DBG_LOG(DBG_LEVEL_1, 
+                "vmx_dr_access : eip=%08x, reg=%d, exit_qualification = %lx\n",
+                eip, reg, exit_qualification);
+
+    switch(exit_qualification & DEBUG_REG_ACCESS_REG) {
+        CASE_GET_REG_P(EAX, eax);
+        CASE_GET_REG_P(ECX, ecx);
+        CASE_GET_REG_P(EDX, edx);
+        CASE_GET_REG_P(EBX, ebx);
+        CASE_GET_REG_P(EBP, ebp);
+        CASE_GET_REG_P(ESI, esi);
+        CASE_GET_REG_P(EDI, edi);
+    case REG_ESP:
+        break;  
+    default:
+        __vmx_bug(regs);
+    }
+        
+    switch (exit_qualification & DEBUG_REG_ACCESS_TYPE) {
+    case TYPE_MOV_TO_DR: 
+        /* don't need to check the range */
+        if (reg != REG_ESP)
+            ed->thread.debugreg[reg] = *reg_p; 
+        else {
+            unsigned long value;
+            __vmread(GUEST_ESP, &value);
+            ed->thread.debugreg[reg] = value;
+        }
+        break;
+    case TYPE_MOV_FROM_DR:
+        if (reg != REG_ESP)
+            *reg_p = ed->thread.debugreg[reg];
+        else {
+            __vmwrite(GUEST_ESP, ed->thread.debugreg[reg]);
+        }
+        break;
+    }
+}
+
+/*
+ * Invalidate the TLB for va. Invalidate the shadow page corresponding
+ * the address va.
+ */
+static void vmx_vmexit_do_invlpg(unsigned long va) 
+{
+    unsigned long eip;
+    struct exec_domain *d = current;
+    unsigned int index;
+
+    __vmread(GUEST_EIP, &eip);
+
+    VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_vmexit_do_invlpg:eip=%08lx, va=%08lx\n",
+            eip, va);
+
+    /*
+     * We do the safest things first, then try to update the shadow
+     * copying from guest
+     */
+    vmx_shadow_invlpg(&d->mm, va);
+    index = (va >> L2_PAGETABLE_SHIFT);
+    d->mm.guest_pl2e_cache[index] = mk_l2_pgentry(0); /* invalidate pgd cache */
+}
+
/*
 * Drop every cached guest page-directory entry for this mm.
 * Assumes the guest_pl2e_cache array is exactly one page in size
 * (memset length is PAGE_SIZE) — TODO confirm against its declaration.
 */
static inline void guest_pl2e_cache_invalidate(struct mm_struct *m) 
{
    /*
     * Need to optimize this: a full-page memset on every CR3 load /
     * CR4 change is heavy-handed.
     */
    memset(m->guest_pl2e_cache, 0, PAGE_SIZE);
}
+
/*
 * Translate a guest virtual address to a guest physical address by
 * walking the guest page tables. Side effect: refills the cached guest
 * PDE (converted to machine-physical) for the L2 slot covering gva.
 * Returns 0 on failure; NOTE(review): 0 is also a legitimate gpa
 * (page 0), so callers cannot distinguish failure from that address.
 */
static inline unsigned long gva_to_gpa(unsigned long gva)
{
    unsigned long gpde, gpte, pfn, index;
    struct exec_domain *d = current;
    struct mm_struct *m = &d->mm;

    __guest_get_pl2e(m, gva, &gpde);
    index = (gva >> L2_PAGETABLE_SHIFT);

    /*
     * NOTE(review): gpde is used without a _PAGE_PRESENT check; a
     * not-present guest PDE would index phys_to_machine_mapping with a
     * bogus frame number — confirm callers guarantee a mapped PDE.
     */
    pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];

    /* Cache the PDE in machine-physical form for later lookups. */
    m->guest_pl2e_cache[index] = 
            mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);

    /* Read the guest PTE through the linear page-table mapping. */
    if ( unlikely(__get_user(gpte, (unsigned long *)
                             &linear_pg_table[gva >> PAGE_SHIFT])) )
    {
        printk("gva_to_gpa EXIT: read gpte faulted" );
        return 0;
    }

    if ( !(gpte & _PAGE_PRESENT) )
    {
        printk("gva_to_gpa - EXIT: gpte not present (%lx)",gpte );
        return 0;
    }

    /* Frame number from the PTE plus the offset within the page. */
    return (gpte & PAGE_MASK) + (gva & ~PAGE_MASK); 
}
+
/*
 * Handle an I/O-instruction VM exit: build an ioreq_t in the page
 * shared with the device model, notify it via the event channel, and
 * block this domain until the response arrives (vmx_io_assist()).
 *
 * Exit-qualification bits used here (per the code below):
 *   bits 2:0  = access size - 1
 *   bit 3     = direction (set for IN)
 *   bit 4     = string instruction (INS/OUTS)
 *   bit 5     = REP prefix
 *   bit 6     = port encoded in bits 31:16; otherwise port is in DX
 */
static void vmx_io_instruction(struct xen_regs *regs, 
                   unsigned long exit_qualification, unsigned long inst_len) 
{
    struct exec_domain *d = current;
    vcpu_iodata_t *vio;
    ioreq_t *p;
    unsigned long addr;
    unsigned long eip;

    extern long evtchn_send(int lport);
    extern long do_block(void);

    __vmread(GUEST_EIP, &eip);

    VMX_DBG_LOG(DBG_LEVEL_1, 
            "vmx_io_instruction: eip=%08lx, exit_qualification = %lx\n",
            eip, exit_qualification);

    /* Port number: immediate operand (bits 31:16) or DX. */
    if (test_bit(6, &exit_qualification))
        addr = (exit_qualification >> 16) & (0xffff);
    else
        addr = regs->edx & 0xffff;

    /* Port 0x80 (POST/delay port): ignore, just skip the instruction. */
    if (addr == 0x80) {
        __update_guest_eip(inst_len);
        return;
    }

    vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
    if (vio == 0) {
        /* NOTE(review): execution continues past domain_crash() here —
         * confirm domain_crash() does not return, else vio is deref'd. */
        VMX_DBG_LOG(DBG_LEVEL_1, "bad shared page: %lx\n", (unsigned long) vio);
        domain_crash(); 
    }
    p = &vio->vp_ioreq;
    p->dir = test_bit(3, &exit_qualification);  
    /* Mark this vcpu as waiting for the device model's response. */
    set_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags);

    p->pdata_valid = 0;
    p->count = 1;
    p->size = (exit_qualification & 7) + 1;

    if (test_bit(4, &exit_qualification)) {
        /* String I/O: data pointer is ESI (write) or EDI (read). */
        p->pdata_valid = 1;
        p->u.pdata = (void *) ((p->dir == IOREQ_WRITE) ?
            regs->esi
            : regs->edi);
        /* NOTE(review): reads p->u.data right after writing p->u.pdata —
         * relies on them overlaying the same union storage. */
        p->u.pdata = (void *) gva_to_gpa(p->u.data);
        if (test_bit(5, &exit_qualification))
            p->count = regs->ecx;
        if ((p->u.data & PAGE_MASK) != 
            ((p->u.data + p->count * p->size - 1) & PAGE_MASK)) {
            /* Clamp the request to the first page; the instruction is
             * not skipped, so the remainder re-faults and continues. */
            printk("stringio crosses page boundary!\n");
            if (p->u.data & (p->size - 1)) {
                printk("Not aligned I/O!\n");
                domain_crash();     
            }
            p->count = (PAGE_SIZE - (p->u.data & ~PAGE_MASK)) / p->size;
        } else {
            __update_guest_eip(inst_len);
        }
    } else if (p->dir == IOREQ_WRITE) {
        /* Non-string OUT: data comes from EAX. */
        p->u.data = regs->eax;
        __update_guest_eip(inst_len);
    } else
        __update_guest_eip(inst_len);

    p->addr = addr;
    p->port_mm = 0;
    /* Publish the request and wake the device model, then block. */
    p->state = STATE_IOREQ_READY;
    evtchn_send(IOPACKET_PORT);
    do_block();
}
+
+#define CASE_GET_REG(REG, reg)  \
+    case REG_ ## REG: value = regs->reg; break
+
/*
 * Write to control registers: emulate a guest MOV to CR0/CR3/CR4.
 * 'gp' selects the general-purpose source register, 'cr' the target
 * control register. CR0 writes may trigger the transition to paged
 * mode (shadow page tables); CR3 writes either flush or rebuild the
 * shadow; CR4 writes may flush the shadow on paging-flag changes.
 */
static void mov_to_cr(int gp, int cr, struct xen_regs *regs)
{
    unsigned long value;
    unsigned long old_cr;
    struct exec_domain *d = current;

    /* Fetch the source value from the guest register file. */
    switch (gp) {
        CASE_GET_REG(EAX, eax);
        CASE_GET_REG(ECX, ecx);
        CASE_GET_REG(EDX, edx);
        CASE_GET_REG(EBX, ebx);
        CASE_GET_REG(EBP, ebp);
        CASE_GET_REG(ESI, esi);
        CASE_GET_REG(EDI, edi);
    case REG_ESP:
        __vmread(GUEST_ESP, &value);
        break;
    default:
        printk("invalid gp: %d\n", gp);
        __vmx_bug(regs);
    }
    
    VMX_DBG_LOG(DBG_LEVEL_1, "mov_to_cr: CR%d, value = %lx, \n", cr, value);
    VMX_DBG_LOG(DBG_LEVEL_1, "current = %lx, \n", (unsigned long) current);

    switch(cr) {
    case 0: 
    {
        unsigned long old_base_pfn = 0, pfn;

        /* 
         * CR0:
         * We don't want to lose PE and PG.
         */
        __vmwrite(GUEST_CR0, (value | X86_CR0_PE | X86_CR0_PG));
        __vmwrite(CR0_READ_SHADOW, value);

        /* NOTE(review): condition fires if either PE or PG is set;
         * presumably the intent is the PG-enable transition — confirm. */
        if (value & (X86_CR0_PE | X86_CR0_PG) &&
            !test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) {
            /*
             * Enable paging
             */
            set_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state);
            /*
             * The guest CR3 must be pointing to the guest physical.
             */
            if (!(pfn = phys_to_machine_mapping[
                      d->thread.arch_vmx.cpu_cr3 >> PAGE_SHIFT])) 
            {
                VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value = %lx\n", 
                        d->thread.arch_vmx.cpu_cr3);
                domain_crash(); /* need to take a clean path */
            }
            old_base_pfn = pagetable_val(d->mm.pagetable) >> PAGE_SHIFT;
            /*
             * Now mm.pagetable points to machine physical.
             */
            d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);

            VMX_DBG_LOG(DBG_LEVEL_VMMU, "New mm.pagetable = %lx\n", 
                    (unsigned long) (pfn << PAGE_SHIFT));

            /* Switch the domain to full 32-bit shadow mode under lock. */
            shadow_lock(&d->mm);
            shadow_mode_enable(d->domain, SHM_full_32); 
            shadow_unlock(&d->mm);

            __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
            /* 
             * mm->shadow_table should hold the next CR3 for shadow
             */
            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, pfn = %lx\n", 
                    d->thread.arch_vmx.cpu_cr3, pfn);
            /* Release the reference on the old (pre-paging) base frame. */
            put_page_and_type(&frame_table[old_base_pfn]);

        }
        break;
    }
    case 3: 
    {
        unsigned long pfn;

        /*
         * If paging is not enabled yet, simply copy the value to CR3.
         */
        if (!test_bit(VMX_CPU_STATE_PG_ENABLED, &d->thread.arch_vmx.cpu_state)) {
            d->thread.arch_vmx.cpu_cr3 = value;
            return;
        }
        
        guest_pl2e_cache_invalidate(&d->mm);
        /*
         * We make a new one if the shadow does not exist.
         */
        if (value == d->thread.arch_vmx.cpu_cr3) {
            /* 
             * This is simple TLB flush, implying the guest has 
             * removed some translation or changed page attributes.
             * We simply invalidate the shadow.
             */
            pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
            if ((pfn << PAGE_SHIFT) != pagetable_val(d->mm.pagetable))
                __vmx_bug(regs);
            vmx_shadow_clear_state(&d->mm);
            shadow_invalidate(&d->mm);
        } else {
            /*
             * If different, make a shadow. Check if the PDBR is valid
             * first.
             */
            VMX_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx\n", value);
            if ((value >> PAGE_SHIFT) > d->domain->max_pages)
            {
                VMX_DBG_LOG(DBG_LEVEL_VMMU, 
                        "Invalid CR3 value=%lx\n", value);
                domain_crash(); /* need to take a clean path */
            }
            pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
            vmx_shadow_clear_state(&d->mm);
            d->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
            shadow_mk_pagetable(&d->mm);
            /* 
             * mm->shadow_table should hold the next CR3 for shadow
             */
            d->thread.arch_vmx.cpu_cr3 = value;
            VMX_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx\n", 
                    value);
            __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
        }
        break;
    }
    case 4:         
        /* CR4: PAE guests are not supported; keep VMXE set for us. */
        if (value & X86_CR4_PAE)
            __vmx_bug(regs);    /* not implemented */
        __vmread(CR4_READ_SHADOW, &old_cr);
        
        __vmwrite(GUEST_CR4, (value | X86_CR4_VMXE));
        __vmwrite(CR4_READ_SHADOW, value);

        /*
         * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
         * all TLB entries except global entries.
         */
        if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) {
            vmx_shadow_clear_state(&d->mm);
            shadow_invalidate(&d->mm);
            guest_pl2e_cache_invalidate(&d->mm);
        }
        break;
    default:
        printk("invalid cr: %d\n", gp);
        __vmx_bug(regs);
    }
}   
+
+#define CASE_SET_REG(REG, reg)      \
+    case REG_ ## REG:       \
+    regs->reg = value;      \
+    break
+
+/*
+ * Read from control registers. CR0 and CR4 are read from the shadow.
+ */
+static void mov_from_cr(int cr, int gp, struct xen_regs *regs)
+{
+    unsigned long value;
+    struct exec_domain *d = current;
+
+    if (cr != 3)
+        __vmx_bug(regs);
+
+    value = (unsigned long) d->thread.arch_vmx.cpu_cr3;
+    ASSERT(value);
+
+    switch (gp) {
+        CASE_SET_REG(EAX, eax);
+        CASE_SET_REG(ECX, ecx);
+        CASE_SET_REG(EDX, edx);
+        CASE_SET_REG(EBX, ebx);
+        CASE_SET_REG(EBP, ebp);
+        CASE_SET_REG(ESI, esi);
+        CASE_SET_REG(EDI, edi);
+    case REG_ESP:
+        __vmwrite(GUEST_ESP, value);
+        regs->esp = value;
+        break;
+    default:
+        printk("invalid gp: %d\n", gp);
+        __vmx_bug(regs);
+    }
+
+    VMX_DBG_LOG(DBG_LEVEL_VMMU, "mov_from_cr: CR%d, value = %lx, \n", cr, value);
+}
+
+static void vmx_cr_access (unsigned long exit_qualification, struct xen_regs *regs)
+{
+    unsigned int gp, cr;
+    unsigned long value;
+
+    switch (exit_qualification & CONTROL_REG_ACCESS_TYPE) {
+    case TYPE_MOV_TO_CR:
+        gp = exit_qualification & CONTROL_REG_ACCESS_REG;
+        cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
+        mov_to_cr(gp, cr, regs);
+        break;
+    case TYPE_MOV_FROM_CR:
+        gp = exit_qualification & CONTROL_REG_ACCESS_REG;
+        cr = exit_qualification & CONTROL_REG_ACCESS_NUM;
+        mov_from_cr(cr, gp, regs);
+        break;
+    case TYPE_CLTS:
+        __vmread(GUEST_CR0, &value);
+        value &= ~X86_CR0_TS; /* clear TS */
+        __vmwrite(GUEST_CR0, value);
+
+        __vmread(CR0_READ_SHADOW, &value);
+        value &= ~X86_CR0_TS; /* clear TS */
+        __vmwrite(CR0_READ_SHADOW, value);
+        break;
+    default:
+        __vmx_bug(regs);
+        break;
+    }
+}
+
/*
 * Emulate RDMSR for the guest: MSR index in ECX, result in EDX:EAX.
 * NOTE(review): this executes the host rdmsr directly with no
 * filtering or virtualization of the MSR index — confirm this is
 * acceptable for all MSRs the guest may read.
 */
static inline void vmx_do_msr_read(struct xen_regs *regs)
{
    VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read: ecx=%x, eax=%x, edx=%x",
            regs->ecx, regs->eax, regs->edx);

    rdmsr(regs->ecx, regs->eax, regs->edx);

    VMX_DBG_LOG(DBG_LEVEL_1, "vmx_do_msr_read returns: ecx=%x, eax=%x, edx=%x",
            regs->ecx, regs->eax, regs->edx);
}
+
/*
 * Need to use this exit to reschedule: the guest executed HLT, so give
 * the CPU to another domain via the scheduler.
 */
static inline void vmx_vmexit_do_hlt()
{
    extern long do_block(void);
#if VMX_DEBUG
    unsigned long eip;
    __vmread(GUEST_EIP, &eip);
#endif
    /* 'eip' only exists under VMX_DEBUG — presumably VMX_DBG_LOG
     * compiles to nothing otherwise; confirm in the header. */
    VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_hlt:eip=%08lx\n", eip);
    __enter_scheduler();
}
+
/*
 * Guest executed MWAIT: treat it like HLT and yield to the scheduler.
 */
static inline void vmx_vmexit_do_mwait()
{
#if VMX_DEBUG
    unsigned long eip;
    __vmread(GUEST_EIP, &eip);
#endif
    /* As in vmx_vmexit_do_hlt(), 'eip' is only defined under VMX_DEBUG. */
    VMX_DBG_LOG(DBG_LEVEL_1, "vmx_vmexit_do_mwait:eip=%08lx\n", eip);
    __enter_scheduler();
}
+
+#define BUF_SIZ     256
+#define MAX_LINE    80
+char print_buf[BUF_SIZ];
+static int index;
+
+static void vmx_print_line(const char c, struct exec_domain *d) 
+{
+
+    if (index == MAX_LINE || c == '\n') {
+        if (index == MAX_LINE) {
+            print_buf[index++] = c;
+        }
+        print_buf[index] = '\0';
+        printk("(GUEST: %u) %s\n", d->domain->id, (char *) &print_buf);
+        index = 0;
+    }
+    else
+        print_buf[index++] = c;
+}
+
#ifdef XEN_DEBUGGER
/*
 * Copy the guest's execution state out of the VMCS into *regs so the
 * pervasive debugger (pdb) can inspect it.
 */
void save_xen_regs(struct xen_regs *regs)
{
    __vmread(GUEST_SS_SELECTOR, &regs->xss);
    __vmread(GUEST_ESP, &regs->esp);
    __vmread(GUEST_EFLAGS, &regs->eflags);
    __vmread(GUEST_CS_SELECTOR, &regs->xcs);
    __vmread(GUEST_EIP, &regs->eip);

    __vmread(GUEST_GS_SELECTOR, &regs->xgs);
    __vmread(GUEST_FS_SELECTOR, &regs->xfs);
    __vmread(GUEST_ES_SELECTOR, &regs->xes);
    __vmread(GUEST_DS_SELECTOR, &regs->xds);
}
+
/*
 * Write a (possibly debugger-modified) register frame back into the
 * VMCS; inverse of save_xen_regs().
 */
void restore_xen_regs(struct xen_regs *regs)
{
    __vmwrite(GUEST_SS_SELECTOR, regs->xss);
    __vmwrite(GUEST_ESP, regs->esp);
    __vmwrite(GUEST_EFLAGS, regs->eflags);
    __vmwrite(GUEST_CS_SELECTOR, regs->xcs);
    __vmwrite(GUEST_EIP, regs->eip);

    __vmwrite(GUEST_GS_SELECTOR, regs->xgs);
    __vmwrite(GUEST_FS_SELECTOR, regs->xfs);
    __vmwrite(GUEST_ES_SELECTOR, regs->xes);
    __vmwrite(GUEST_DS_SELECTOR, regs->xds);
}
#endif
+
/*
 * Top-level VM-exit dispatcher, called from the VM-exit assembly stub
 * with the guest's general registers on the stack. Reads the exit
 * reason from the VMCS and routes to the per-reason emulation helpers.
 */
asmlinkage void vmx_vmexit_handler(struct xen_regs regs)
{
    unsigned int exit_reason, idtv_info_field;
    unsigned long exit_qualification, eip, inst_len = 0;
    struct exec_domain *d = current;
    int error;

    if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
        __vmx_bug(&regs);
    
    /*
     * If an event was being delivered through the IDT when this exit
     * occurred, re-inject it on the next VM entry.
     */
    __vmread(IDT_VECTORING_INFO_FIELD, &idtv_info_field);
    if (idtv_info_field & INTR_INFO_VALID_MASK) {
        __vmwrite(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
        if ((idtv_info_field & 0xff) == 14) {
            unsigned long error_code;

            /* Vector 14 is the page fault; log its error code. */
            __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
            printk("#PG error code: %lx\n", error_code);
        }
        VMX_DBG_LOG(DBG_LEVEL_1, "idtv_info_field=%x\n", 
                idtv_info_field);
    }

    /* Don't log the high-frequency exits (H/W interrupts, VMCALL, I/O). */
    if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
        exit_reason != EXIT_REASON_VMCALL &&
        exit_reason != EXIT_REASON_IO_INSTRUCTION)
        VMX_DBG_LOG(DBG_LEVEL_0, "exit reason = %x\n", exit_reason);

    /* A failed VM entry is unrecoverable here: kill the domain. */
    if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
        __vmread(GUEST_EIP, &eip);
        domain_crash();         
        return;
    }

    switch (exit_reason) {
    case EXIT_REASON_EXCEPTION_NMI:
    {
#define VECTOR_DB   1
#define VECTOR_BP   3
#define VECTOR_GP   13
#define VECTOR_PG   14

        /*
         * We don't set the software-interrupt exiting (INT n). 
         * (1) We can get an exception (e.g. #PG) in the guest, or
         * (2) NMI
         */
        int error;
        unsigned int vector;
        unsigned long va;
        unsigned long error_code;

        /* NOTE(review): with '&&', the validity bit is only checked when
         * __vmread() itself fails — presumably '||' was intended. */
        if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
            && !(vector & INTR_INFO_VALID_MASK))
            __vmx_bug(&regs);
        vector &= 0xff;

        switch (vector) {
#ifdef XEN_DEBUGGER
        case VECTOR_DB:
        {
            /* Hand #DB to the pervasive debugger. */
            save_xen_regs(&regs);
            pdb_handle_exception(1, &regs, 1);
            restore_xen_regs(&regs);
            break;
        }
        case VECTOR_BP:
        {
            /* Hand #BP (INT3) to the pervasive debugger. */
            save_xen_regs(&regs);
            pdb_handle_exception(3, &regs, 1);
            restore_xen_regs(&regs);
            break;
        }
#endif
        case VECTOR_GP:
        {
            vmx_do_general_protection_fault(&regs);
            break;  
        }
        case VECTOR_PG:
        {
            /* Faulting address is in the exit qualification for #PF. */
            __vmread(EXIT_QUALIFICATION, &va);
            __vmread(VM_EXIT_INTR_ERROR_CODE, &error_code);
            VMX_DBG_LOG(DBG_LEVEL_VMMU, 
                    "eax=%x, ebx=%x, ecx=%x, edx=%x, esi=%x, edi=%x\n", regs.eax, regs.ebx, regs.ecx, regs.edx, regs.esi, regs.edi);

            /* If the shadow code couldn't fix it up, reflect the fault. */
            if (!(error = vmx_do_page_fault(va, error_code))) {
                /*
                 * Inject #PG using Interruption-Information Fields
                 */
                unsigned long intr_fields;

                intr_fields = (INTR_INFO_VALID_MASK | 
                           INTR_TYPE_EXCEPTION |
                           INTR_INFO_DELIEVER_CODE_MASK |
                           VECTOR_PG);
                __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields);
                __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
                /* CR2 is restored into the real register on VM entry. */
                d->thread.arch_vmx.cpu_cr2 = va;
            }
            break;
        }
        default:
            __vmx_bug(&regs);
            break;
        }
        break;
    }
    case EXIT_REASON_EXTERNAL_INTERRUPT: 
    {
        /* A host interrupt arrived while the guest was running:
         * dispatch it through Xen's normal interrupt paths. */
        extern int vector_irq[];
        extern asmlinkage void do_IRQ(struct xen_regs);
        extern void smp_apic_timer_interrupt(struct xen_regs *);
        extern void timer_interrupt(int, void *, struct xen_regs *);
        unsigned int    vector;

        /* NOTE(review): same suspect '&&' validity check as above. */
        if ((error = __vmread(VM_EXIT_INTR_INFO, &vector))
            && !(vector & INTR_INFO_VALID_MASK))
            __vmx_bug(&regs);

        vector &= 0xff;
        local_irq_disable();

        if (vector == LOCAL_TIMER_VECTOR) {
            smp_apic_timer_interrupt(&regs);
        } else {
            regs.entry_vector = (vector == FIRST_DEVICE_VECTOR?
                     0 : vector_irq[vector]);
            do_IRQ(regs);
        }
        break;
    }
    case EXIT_REASON_PENDING_INTERRUPT:
        /* Interrupt-window exit: re-arm controls and try to inject. */
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
              MONITOR_CPU_BASED_EXEC_CONTROLS);
        vmx_intr_assist(d);
        break;
    case EXIT_REASON_TASK_SWITCH:
        __vmx_bug(&regs);
        break;
    case EXIT_REASON_CPUID:
        __get_instruction_length(inst_len);
        vmx_vmexit_do_cpuid(regs.eax, &regs);
        __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_HLT:
        __get_instruction_length(inst_len);
        __update_guest_eip(inst_len);
        vmx_vmexit_do_hlt();
        break;
    case EXIT_REASON_INVLPG:
    {
        unsigned long   va;

        __vmread(EXIT_QUALIFICATION, &va);
        vmx_vmexit_do_invlpg(va);
        __get_instruction_length(inst_len);
        __update_guest_eip(inst_len);
        break;
    }
    case EXIT_REASON_VMCALL:
        /* VMCALL is (ab)used as a guest console: EAX holds a char. */
        __get_instruction_length(inst_len);
        __vmread(GUEST_EIP, &eip);
        __vmread(EXIT_QUALIFICATION, &exit_qualification);

        vmx_print_line(regs.eax, d); /* provides the current domain */
        __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_CR_ACCESS:
    {
        __vmread(GUEST_EIP, &eip);
        __get_instruction_length(inst_len);
        __vmread(EXIT_QUALIFICATION, &exit_qualification);

        VMX_DBG_LOG(DBG_LEVEL_1, "eip = %lx, inst_len =%lx, exit_qualification = %lx\n", 
                eip, inst_len, exit_qualification);
        vmx_cr_access(exit_qualification, &regs);
        __update_guest_eip(inst_len);
        break;
    }
    case EXIT_REASON_DR_ACCESS:
        __vmread(EXIT_QUALIFICATION, &exit_qualification);  
        vmx_dr_access(exit_qualification, &regs);
        __get_instruction_length(inst_len);
        __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_IO_INSTRUCTION:
        /* EIP is advanced inside vmx_io_instruction() as appropriate. */
        __vmread(EXIT_QUALIFICATION, &exit_qualification);
        __get_instruction_length(inst_len);
        vmx_io_instruction(&regs, exit_qualification, inst_len);
        break;
    case EXIT_REASON_MSR_READ:
        __get_instruction_length(inst_len);
        vmx_do_msr_read(&regs);
        __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_MSR_WRITE:
        __vmread(GUEST_EIP, &eip);
        VMX_DBG_LOG(DBG_LEVEL_1, "MSR_WRITE: eip=%08lx, eax=%08x, edx=%08x",
                eip, regs.eax, regs.edx);
        /* just ignore this point */
        __get_instruction_length(inst_len);
        __update_guest_eip(inst_len);
        break;
    case EXIT_REASON_MWAIT_INSTRUCTION:
        __get_instruction_length(inst_len);
        __update_guest_eip(inst_len);
        vmx_vmexit_do_mwait();
        break;
    default:
        __vmx_bug(&regs);       /* should not happen */
    }
    return;
}
+
/*
 * Restore the guest's virtual CR2 into the real CR2 register just
 * before VM entry (CR2 is not held in the VMCS). Interrupts must stay
 * disabled from here until the entry so CR2 is not clobbered.
 */
asmlinkage void load_cr2(void)
{
    struct exec_domain *d = current;

    local_irq_disable();        
    asm volatile("movl %0,%%cr2": :"r" (d->thread.arch_vmx.cpu_cr2));
}
diff --git a/xen/arch/x86/vmx_io.c b/xen/arch/x86/vmx_io.c
new file mode 100644 (file)
index 0000000..881e297
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * vmx_io.c: handling I/O and interrupts related to VMX entry/exit 
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <asm/vmx_vmcs.h>
+#include <xen/event.h>
+#include <public/io/ioreq.h>
+
+void vmx_io_assist(struct exec_domain *ed) 
+{
+    vcpu_iodata_t *vio;
+    ioreq_t *p;
+    struct domain *d = ed->domain;
+    execution_context_t *ec = get_execution_context();
+    unsigned long old_eax;
+    extern long do_block();
+    unsigned long eflags;
+    int dir;
+
+    /* clear the pending event */
+    ed->vcpu_info->evtchn_upcall_pending = 0;
+    /* clear the pending bit for port 2 */
+    clear_bit(IOPACKET_PORT>>5, &ed->vcpu_info->evtchn_pending_sel);
+    clear_bit(IOPACKET_PORT, &d->shared_info->evtchn_pending[0]);
+
+    vio = (vcpu_iodata_t *) ed->thread.arch_vmx.vmx_platform.shared_page_va;
+    if (vio == 0) {
+        VMX_DBG_LOG(DBG_LEVEL_1, 
+                    "bad shared page: %lx\n", (unsigned long) vio);
+        domain_crash();
+    }
+    p = &vio->vp_ioreq;
+    /* clear IO wait VMX flag */
+    if (test_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags)) {
+        if (p->state != STATE_IORESP_READY) {
+            printk("got a false I/O reponse\n");
+            do_block();
+        } else {
+            p->state = STATE_INVALID;
+        }
+        clear_bit(ARCH_VMX_IO_WAIT, &ed->thread.arch_vmx.flags);
+    } else {
+        return;
+    }
+
+    __vmread(GUEST_EFLAGS, &eflags);
+    dir = (eflags & X86_EFLAGS_DF);
+
+    if (p->dir == IOREQ_WRITE) {
+        if (p->pdata_valid) {
+            if (!dir)
+                ec->esi += p->count * p->size;
+            else
+                ec->esi -= p->count * p->size;
+            ec->ecx -= p->count;
+        }
+        return;
+    } else {
+        if (p->pdata_valid) {
+            if (!dir)
+                ec->edi += p->count * p->size;
+            else
+                ec->edi -= p->count * p->size;
+            ec->ecx -= p->count;
+            return;
+        }
+    }
+
+    old_eax = ec->eax;
+
+    switch(p->size) {
+    case 1:
+        ec->eax = (old_eax & 0xffffff00) | (p->u.data & 0xff);
+        break;
+    case 2:
+        ec->eax = (old_eax & 0xffff0000) | (p->u.data & 0xffff);
+        break;
+    case 4:
+        ec->eax = (p->u.data & 0xffffffff);
+        break;
+    default:
+        BUG();
+    }
+}
+
/*
 * Find-last-set: index (0-31) of the most significant set bit of
 * 'word', or -1 if word is zero. Uses BSR; BSR leaves the destination
 * undefined for a zero input, but the ternary below only reads 'bit'
 * when word is non-zero.
 */
static inline int __fls(unsigned long word)
{
    int bit;

    __asm__("bsrl %1,%0"
            :"=r" (bit)
            :"rm" (word));
    return word ? bit : -1;
}
+
+
/*
 * Simple minded Local APIC priority implementation. Fix later.
 * Scan the 256-bit pending-interrupt bitmap (8 x 32-bit words) from
 * the highest word down and return the highest set vector number;
 * falls through to __fls(pintr[0]), which yields -1 when nothing is
 * pending at all.
 */
static __inline__ int find_highest_irq(unsigned long *pintr)
{
    int word;

    /* Word k covers vectors 32*k .. 32*k+31; note 256-32*(8-k) == 32*k. */
    for (word = 7; word > 0; word--) {
        if (pintr[word])
            return __fls(pintr[word]) + 32 * word;
    }
    return __fls(pintr[0]);
}
+
/*
 * Return 0-255 for pending irq.
 *        -1 when no pending.
 * Reads the virtual interrupt bitmap from the page shared with the
 * device model.
 */
static inline int find_highest_pending_irq(struct exec_domain *d)
{
    vcpu_iodata_t *vio;

    vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
    if (vio == 0) {
        /* NOTE(review): falls through after domain_crash() — confirm it
         * does not return, else vio is dereferenced below. */
        VMX_DBG_LOG(DBG_LEVEL_1, 
                    "bad shared page: %lx\n", (unsigned long) vio);
        domain_crash();
    }
        
    return find_highest_irq(&vio->vp_intr[0]);
}
+
/*
 * Acknowledge a virtual interrupt: clear 'vector' from the pending
 * bitmap in the shared page before injecting it into the guest.
 */
static inline void clear_highest_bit(struct exec_domain *d, int vector)
{
    vcpu_iodata_t *vio;

    vio = (vcpu_iodata_t *) d->thread.arch_vmx.vmx_platform.shared_page_va;
    if (vio == 0) {
        /* NOTE(review): same domain_crash() fall-through pattern as in
         * find_highest_pending_irq(). */
        VMX_DBG_LOG(DBG_LEVEL_1, 
                    "bad shared page: %lx\n", (unsigned long) vio);
        domain_crash();
    }
        
    clear_bit(vector, &vio->vp_intr[0]);
}
+
+static inline int irq_masked(unsigned long eflags)
+{
+    return ((eflags & X86_EFLAGS_IF) == 0);
+}
+
/*
 * Try to inject the highest-priority pending virtual interrupt into
 * the guest before VM entry. Bails out if nothing is pending, if an
 * event is already queued for injection, or if the guest has
 * interrupts masked.
 */
void vmx_intr_assist(struct exec_domain *d) 
{
    int highest_vector = find_highest_pending_irq(d);
    unsigned long intr_fields, eflags;

    if (highest_vector == -1)
        return;

    /* An event is already queued for this entry: don't overwrite it. */
    __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
    if (intr_fields & INTR_INFO_VALID_MASK) {
        VMX_DBG_LOG(DBG_LEVEL_1, "vmx_intr_assist: intr_fields: %lx\n", 
                    intr_fields);
        return;
    }

    /* Guest IF clear: leave the interrupt pending in the bitmap. */
    __vmread(GUEST_EFLAGS, &eflags);
    if (irq_masked(eflags)) {
        VMX_DBG_LOG(DBG_LEVEL_1, "guesting pending: %x, eflags: %lx\n", 
                    highest_vector, eflags);
        return;
    }
        
    /* Consume the bit and program the injection fields. */
    clear_highest_bit(d, highest_vector); 
    intr_fields = (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | highest_vector);
    __vmwrite(VM_ENTRY_INTR_INFO_FIELD, intr_fields);

    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);

    return;
}
+
/*
 * Prepare this vcpu for VM entry after a (re)schedule: refresh the
 * per-cpu VMCS fields, complete any finished I/O request, and inject
 * a pending virtual interrupt if allowed.
 */
void vmx_do_resume(struct exec_domain *d) 
{
    extern long do_block();

    __vmwrite(HOST_CR3, pagetable_val(d->mm.monitor_table));
    __vmwrite(GUEST_CR3, pagetable_val(d->mm.shadow_table));
    __vmwrite(HOST_ESP, (unsigned long) get_stack_top());

    if (event_pending(d)) {
        if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0])) 
            vmx_io_assist(d);

        else if (test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags)) {
            /* Event arrived that is not the I/O response we block on. */
            printk("got an event while blocked on I/O\n");
            do_block();
        }
                
        /* Assumption: device model will not inject an interrupt
         * while an ioreq_t is pending i.e. the response and 
         * interrupt can come together. But an interrupt without 
         * a response to ioreq_t is not ok.
         */
    }
    /* Only inject interrupts when no I/O request is outstanding. */
    if (!test_bit(ARCH_VMX_IO_WAIT, &d->thread.arch_vmx.flags))
        vmx_intr_assist(d);
}
diff --git a/xen/arch/x86/vmx_vmcs.c b/xen/arch/x86/vmx_vmcs.c
new file mode 100644 (file)
index 0000000..755d481
--- /dev/null
@@ -0,0 +1,503 @@
+/*
+ * vmx_vmcs.c: VMCS management
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/errno.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <xen/event.h>
+#include <xen/kernel.h>
+#include <public/io/ioreq.h>
+#include <asm/domain_page.h>
+
+struct vmcs_struct *alloc_vmcs(void) 
+{
+    struct vmcs_struct *vmcs;
+    unsigned int cpu_sig = cpuid_eax(0x00000001);
+
+    vmcs = (struct vmcs_struct *) alloc_xenheap_pages(get_order(vmcs_size)); 
+    memset((char *) vmcs, 0, vmcs_size); /* don't remove this */
+
+    vmcs->vmcs_revision_id = (cpu_sig > 0xf41)? 3 : 1;
+    return vmcs;
+} 
+
+void free_vmcs(struct vmcs_struct *vmcs)
+{
+    int order;
+
+    order = (vmcs_size >> PAGE_SHIFT) - 1;
+    free_xenheap_pages((unsigned long) vmcs, order);
+}
+
+static inline int construct_vmcs_controls(void)
+{
+    int error = 0;
+        
+    error |= __vmwrite(PIN_BASED_VM_EXEC_CONTROL, 
+                       MONITOR_PIN_BASED_EXEC_CONTROLS);
+
+    error |= __vmwrite(CPU_BASED_VM_EXEC_CONTROL, 
+                       MONITOR_CPU_BASED_EXEC_CONTROLS);
+
+    error |= __vmwrite(VM_EXIT_CONTROLS, MONITOR_VM_EXIT_CONTROLS);
+    error |= __vmwrite(VM_ENTRY_CONTROLS, MONITOR_VM_ENTRY_CONTROLS);
+
+    return error;
+}
+
+#define GUEST_SEGMENT_LIMIT     0xffffffff      
+#define HOST_SEGMENT_LIMIT      0xffffffff      
+
+struct host_execution_env {
+    /* selectors */
+    unsigned short ldtr_selector;
+    unsigned short tr_selector;
+    unsigned short ds_selector;
+    unsigned short cs_selector;
+    /* limits */
+    unsigned short gdtr_limit;
+    unsigned short ldtr_limit;
+    unsigned short idtr_limit;
+    unsigned short tr_limit;
+    /* base */
+    unsigned long gdtr_base;
+    unsigned long ldtr_base;
+    unsigned long idtr_base;
+    unsigned long tr_base;
+    unsigned long ds_base;
+    unsigned long cs_base;
+    /* control registers */
+    unsigned long cr3;
+    unsigned long cr0;
+    unsigned long cr4;
+    unsigned long dr7;
+};
+
+#define round_pgdown(_p) ((_p)&PAGE_MASK) /* copied from domain.c */
+
+int vmx_setup_platform(struct exec_domain *d, execution_context_t *context)
+{
+    int i;
+    unsigned int n;
+    unsigned long *p, mpfn, offset, addr;
+    struct e820entry *e820p;
+    unsigned long gpfn = 0;
+
+    context->ebx = 0;   /* Linux expects ebx to be 0 for boot proc */
+
+    n = context->ecx;
+    if (n > 32) {
+        VMX_DBG_LOG(DBG_LEVEL_1, "Too many e820 entries: %d\n", n);
+        return -1;
+    }
+
+    addr = context->edi;
+    offset = (addr & ~PAGE_MASK);
+    addr = round_pgdown(addr);
+    mpfn = phys_to_machine_mapping[addr >> PAGE_SHIFT];
+    p = map_domain_mem(mpfn << PAGE_SHIFT);
+
+    e820p = (struct e820entry *) ((unsigned long) p + offset); 
+
+    for (i = 0; i < n; i++) {
+        if (e820p[i].type == E820_SHARED_PAGE) {
+            gpfn = (e820p[i].addr >> PAGE_SHIFT);
+            break;
+        }
+    }
+
+    if (gpfn == 0) {
+        VMX_DBG_LOG(DBG_LEVEL_1, "No shared Page ?\n");
+        return -1;
+    }   
+    unmap_domain_mem(p);        
+
+    mpfn = phys_to_machine_mapping[gpfn];
+    p = map_domain_mem(mpfn << PAGE_SHIFT);
+    d->thread.arch_vmx.vmx_platform.shared_page_va = (unsigned long) p;
+
+    return 0;
+}
+
+
+/*
+ * Add <guest pfn, machine pfn> mapping to per-domain mapping. Full
+ * virtualization does not need per-domain mapping.
+ */
+static int add_mapping_perdomain(struct exec_domain *d, unsigned long gpfn, 
+                                 unsigned long mpfn)
+{
+    struct pfn_info *page;
+    unsigned long pfn = 0;
+
+    /*
+     * We support up to 4GB memory for a guest at this point
+     */
+    if (gpfn > ENTRIES_PER_L2_PAGETABLE * ENTRIES_PER_L1_PAGETABLE)
+        return -1;
+
+    if (!(l1_pgentry_val(d->domain->mm_perdomain_pt[
+            gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]) & _PAGE_PRESENT))
+    {
+        page = (struct pfn_info *) alloc_domheap_page(NULL);
+        if (!page) {
+            return -1;
+        }
+
+        pfn = (unsigned long) (page - frame_table);
+        d->domain->mm_perdomain_pt[gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)] = 
+            mk_l1_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    }
+    phys_to_machine_mapping[gpfn] = mpfn;
+
+    return 0;
+}
+
+void vmx_do_launch(struct exec_domain *ed) 
+{
+/* Update CR3, GDT, LDT, TR */
+    unsigned int tr, cpu, error = 0;
+    struct host_execution_env host_env;
+    struct Xgt_desc_struct desc;
+    struct list_head *list_ent;
+    l2_pgentry_t *mpl2e, *guest_pl2e_cache;
+    unsigned long i, pfn = 0;
+    struct pfn_info *page;
+    execution_context_t *ec = get_execution_context();
+    struct domain *d = ed->domain;
+
+    cpu =  smp_processor_id();
+    ed->mm.min_pfn = ed->mm.max_pfn = 0;
+
+    spin_lock(&d->page_alloc_lock);
+    list_ent = d->page_list.next;
+
+    mpl2e = (l2_pgentry_t *) map_domain_mem(pagetable_val(ed->mm.monitor_table));
+    ASSERT(mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]);
+
+    for (i = 0; list_ent != &d->page_list; i++ ) {
+        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
+        ed->mm.min_pfn = min(ed->mm.min_pfn, pfn);
+        ed->mm.max_pfn = max(ed->mm.max_pfn, pfn);
+        list_ent = frame_table[pfn].list.next;
+        add_mapping_perdomain(ed, i, pfn);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    page = (struct pfn_info *) alloc_domheap_page(NULL);
+    pfn = (unsigned long) (page - frame_table);
+
+    /*
+     * make linear_pt_table work for guest ptes
+     */
+    mpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((pfn << PAGE_SHIFT)| __PAGE_HYPERVISOR);
+
+    guest_pl2e_cache = map_domain_mem(pfn << PAGE_SHIFT);
+    memset(guest_pl2e_cache, 0, PAGE_SIZE); /* clean it up */
+    ed->mm.guest_pl2e_cache = guest_pl2e_cache; 
+        
+    unmap_domain_mem(mpl2e);
+
+    vmx_setup_platform(ed, ec);
+
+    __asm__ __volatile__ ("sgdt  (%%eax) \n" :: "a"(&desc) : "memory");
+    host_env.gdtr_limit = desc.size;
+    host_env.gdtr_base = desc.address;
+
+    error |= __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base);
+
+    error |= __vmwrite(GUEST_LDTR_SELECTOR, 0);
+    error |= __vmwrite(GUEST_LDTR_BASE, 0);
+    error |= __vmwrite(GUEST_LDTR_LIMIT, 0);
+        
+    __asm__ __volatile__ ("str  (%%eax) \n" :: "a"(&tr) : "memory");
+    host_env.tr_selector = tr;
+    host_env.tr_limit = sizeof(struct tss_struct);
+    host_env.tr_base = (unsigned long) &init_tss[cpu];
+
+    error |= __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector);
+    error |= __vmwrite(HOST_TR_BASE, host_env.tr_base);
+    error |= __vmwrite(GUEST_TR_BASE, 0);
+    error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
+
+    ed->mm.shadow_table = ed->mm.pagetable;
+    __vmwrite(GUEST_CR3, pagetable_val(ed->mm.pagetable));
+    __vmwrite(HOST_CR3, pagetable_val(ed->mm.monitor_table));
+    __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+
+    ed->thread.schedule_tail = arch_vmx_do_resume;
+}
+
+/*
+ * Initially set the same environment as the host.
+ */
+static inline int 
+construct_init_vmcs_guest(execution_context_t *context, 
+                          full_execution_context_t *full_context,
+                          struct host_execution_env *host_env)
+{
+    int error = 0;
+    union vmcs_arbytes arbytes;
+    unsigned long dr7;
+    unsigned long eflags, shadow_cr;
+
+    /* MSR */
+    error |= __vmwrite(VM_EXIT_MSR_LOAD_ADDR, 0);
+    error |= __vmwrite(VM_EXIT_MSR_STORE_ADDR, 0);
+
+    error |= __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+    error |= __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+    error |= __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+    /* interrupt */
+    error |= __vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+    /* mask */
+    error |= __vmwrite(CR0_GUEST_HOST_MASK, 0xffffffff);
+    error |= __vmwrite(CR4_GUEST_HOST_MASK, 0xffffffff);
+
+    error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+    error |= __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+
+    /* TSC */
+    error |= __vmwrite(TSC_OFFSET, 0);
+    error |= __vmwrite(CR3_TARGET_COUNT, 0);
+
+    /* Guest Selectors */
+    error |= __vmwrite(GUEST_CS_SELECTOR, context->cs);
+    error |= __vmwrite(GUEST_ES_SELECTOR, context->es);
+    error |= __vmwrite(GUEST_SS_SELECTOR, context->ss);
+    error |= __vmwrite(GUEST_DS_SELECTOR, context->ds);
+    error |= __vmwrite(GUEST_FS_SELECTOR, context->fs);
+    error |= __vmwrite(GUEST_GS_SELECTOR, context->gs);
+
+    /* Guest segment Limits */
+    error |= __vmwrite(GUEST_CS_LIMIT, GUEST_SEGMENT_LIMIT);
+    error |= __vmwrite(GUEST_ES_LIMIT, GUEST_SEGMENT_LIMIT);
+    error |= __vmwrite(GUEST_SS_LIMIT, GUEST_SEGMENT_LIMIT);
+    error |= __vmwrite(GUEST_DS_LIMIT, GUEST_SEGMENT_LIMIT);
+    error |= __vmwrite(GUEST_FS_LIMIT, GUEST_SEGMENT_LIMIT);
+    error |= __vmwrite(GUEST_GS_LIMIT, GUEST_SEGMENT_LIMIT);
+
+    error |= __vmwrite(GUEST_IDTR_LIMIT, host_env->idtr_limit);
+
+    /* AR bytes */
+    arbytes.bytes = 0;
+    arbytes.fields.seg_type = 0x3;          /* type = 3 */
+    arbytes.fields.s = 1;                   /* code or data, i.e. not system */
+    arbytes.fields.dpl = 0;                 /* DPL = 0 */
+    arbytes.fields.p = 1;                   /* segment present */
+    arbytes.fields.default_ops_size = 1;    /* 32-bit */
+    arbytes.fields.g = 1;   
+    arbytes.fields.null_bit = 0;            /* not null */
+
+    error |= __vmwrite(GUEST_ES_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_SS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_DS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_FS_AR_BYTES, arbytes.bytes);
+    error |= __vmwrite(GUEST_GS_AR_BYTES, arbytes.bytes);
+
+    arbytes.fields.seg_type = 0xb;          /* type = 0xb */
+    error |= __vmwrite(GUEST_CS_AR_BYTES, arbytes.bytes);
+
+    error |= __vmwrite(GUEST_GDTR_BASE, context->edx);
+    context->edx = 0;
+    error |= __vmwrite(GUEST_GDTR_LIMIT, context->eax);
+    context->eax = 0;
+
+    arbytes.fields.s = 0;                   /* not a code or data segment */
+    arbytes.fields.seg_type = 0x2;          /* LDT */
+    arbytes.fields.default_ops_size = 0;    /* 16-bit */
+    arbytes.fields.g = 0;   
+    error |= __vmwrite(GUEST_LDTR_AR_BYTES, arbytes.bytes);
+
+    arbytes.fields.seg_type = 0xb;          /* 32-bit TSS (busy) */
+    error |= __vmwrite(GUEST_TR_AR_BYTES, arbytes.bytes);
+
+    error |= __vmwrite(GUEST_CR0, host_env->cr0); /* same CR0 */
+
+    /* Initially PG, PE are not set */
+    shadow_cr = host_env->cr0;
+    shadow_cr &= ~(X86_CR0_PE | X86_CR0_PG);
+    error |= __vmwrite(CR0_READ_SHADOW, shadow_cr);
+    /* CR3 is set in vmx_final_setup_guestos */
+    error |= __vmwrite(GUEST_CR4, host_env->cr4);
+    shadow_cr = host_env->cr4;
+    shadow_cr &= ~(X86_CR4_PGE | X86_CR4_VMXE);
+    error |= __vmwrite(CR4_READ_SHADOW, shadow_cr);
+
+    error |= __vmwrite(GUEST_ES_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_CS_BASE, host_env->cs_base);
+    error |= __vmwrite(GUEST_SS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_DS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_FS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_GS_BASE, host_env->ds_base);
+    error |= __vmwrite(GUEST_IDTR_BASE, host_env->idtr_base);
+
+    error |= __vmwrite(GUEST_ESP, context->esp);
+    error |= __vmwrite(GUEST_EIP, context->eip);
+
+    eflags = context->eflags & ~VMCS_EFLAGS_RESERVED_0; /* clear 0s */
+    eflags |= VMCS_EFLAGS_RESERVED_1; /* set 1s */
+
+    error |= __vmwrite(GUEST_EFLAGS, eflags);
+
+    error |= __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+    __asm__ __volatile__ ("mov %%dr7, %0\n" : "=r" (dr7));
+    error |= __vmwrite(GUEST_DR7, dr7);
+    error |= __vmwrite(GUEST_VMCS0, 0xffffffff);
+    error |= __vmwrite(GUEST_VMCS1, 0xffffffff);
+
+    return error;
+}
+
+static inline int construct_vmcs_host(struct host_execution_env *host_env)
+{
+    int error = 0;
+    unsigned long crn;
+    struct Xgt_desc_struct desc;
+
+    /* Host Selectors */
+    host_env->ds_selector = __HYPERVISOR_DS;
+    error |= __vmwrite(HOST_ES_SELECTOR, host_env->ds_selector);
+    error |= __vmwrite(HOST_SS_SELECTOR, host_env->ds_selector);
+    error |= __vmwrite(HOST_DS_SELECTOR, host_env->ds_selector);
+    error |= __vmwrite(HOST_FS_SELECTOR, host_env->ds_selector);
+    error |= __vmwrite(HOST_GS_SELECTOR, host_env->ds_selector);
+
+    host_env->cs_selector = __HYPERVISOR_CS;
+    error |= __vmwrite(HOST_CS_SELECTOR, host_env->cs_selector);
+
+    host_env->ds_base = 0;
+    host_env->cs_base = 0;
+    error |= __vmwrite(HOST_FS_BASE, host_env->ds_base);
+    error |= __vmwrite(HOST_GS_BASE, host_env->ds_base);
+
+/* Debug */
+    __asm__ __volatile__ ("sidt  (%%eax) \n" :: "a"(&desc) : "memory");
+    host_env->idtr_limit = desc.size;
+    host_env->idtr_base = desc.address;
+    error |= __vmwrite(HOST_IDTR_BASE, host_env->idtr_base);
+
+    __asm__ __volatile__ ("movl %%cr0,%0" : "=r" (crn) : );
+    host_env->cr0 = crn;
+    error |= __vmwrite(HOST_CR0, crn); /* same CR0 */
+
+    /* CR3 is set in vmx_final_setup_hostos */
+    __asm__ __volatile__ ("movl %%cr4,%0" : "=r" (crn) : ); 
+    host_env->cr4 = crn;
+    error |= __vmwrite(HOST_CR4, crn);
+    error |= __vmwrite(HOST_EIP, (unsigned long) vmx_asm_vmexit_handler);
+
+    return error;
+}
+
+/*
+ * Need to extend to support full virtualization.
+ * The variable use_host_env indicates if the new VMCS needs to use
+ * the same setups as the host has (xenolinux).
+ */
+
+int construct_vmcs(struct arch_vmx_struct *arch_vmx,
+                   execution_context_t *context,
+                   full_execution_context_t *full_context,
+                   int use_host_env)
+{
+    int error;
+    u64 vmcs_phys_ptr;
+
+    struct host_execution_env host_env;
+
+    if (use_host_env != VMCS_USE_HOST_ENV)
+        return -EINVAL;
+
+    memset(&host_env, 0, sizeof(struct host_execution_env));
+
+    vmcs_phys_ptr = (u64) virt_to_phys(arch_vmx->vmcs);
+
+    if ((error = __vmpclear (vmcs_phys_ptr))) {
+        printk("construct_vmcs: VMCLEAR failed\n");
+        return -EINVAL;         
+    }
+    if ((error = load_vmcs(arch_vmx, vmcs_phys_ptr))) {
+        printk("construct_vmcs: load_vmcs failed: VMCS = %lx\n",
+               (unsigned long) vmcs_phys_ptr);
+        return -EINVAL; 
+    }
+    if ((error = construct_vmcs_controls())) {
+        printk("construct_vmcs: construct_vmcs_controls failed\n");
+        return -EINVAL;         
+    }
+    /* host selectors */
+    if ((error = construct_vmcs_host(&host_env))) {
+        printk("construct_vmcs: construct_vmcs_host failed\n");
+        return -EINVAL;         
+    }
+    /* guest selectors */
+    if ((error = construct_init_vmcs_guest(context, full_context, &host_env))) {
+        printk("construct_vmcs: construct_vmcs_guest failed\n");
+        return -EINVAL;         
+    }       
+
+    if ((error |= __vmwrite(EXCEPTION_BITMAP, 
+                            MONITOR_DEFAULT_EXCEPTION_BITMAP))) {
+        printk("construct_vmcs: setting Exception bitmap failed\n");
+        return -EINVAL;         
+    }
+
+    return 0;
+}
+
+int load_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
+{
+    int error;
+
+    if ((error = __vmptrld(phys_ptr))) {
+        clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
+        return error;
+    }
+    set_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
+    return 0;
+}
+
+int store_vmcs(struct arch_vmx_struct *arch_vmx, u64 phys_ptr) 
+{
+    /* take the current VMCS */
+    __vmptrst(phys_ptr);
+    clear_bit(ARCH_VMX_VMCS_LOADED, &arch_vmx->flags); 
+    return 0;
+}
+
+void vm_launch_fail(unsigned long eflags)
+{
+    BUG();
+}
+
+void vm_resume_fail(unsigned long eflags)
+{
+    BUG();
+}
+
index b63d8203a0e19982f76420b534108d2cc63d514f..226d5f7ddc0a5cba62883b1017e3f792cd1ba330 100644 (file)
         andl $~3,reg;      \
         movl (reg),reg;
 
+#ifdef CONFIG_VMX
+/*
+ * At VMExit time the processor saves the guest selectors, esp, eip, 
+ * and eflags. Therefore we don't save them, but simply decrement 
+ * the kernel stack pointer to make it consistent with the stack frame 
+ * at usual interruption time. The host's eflags are not saved by VMX, 
+ * so we set them to a fixed value.
+ *
+ * We also need the room, especially because orig_eax field is used 
+ * by do_IRQ(). Compared to xen_regs, we skip pushing the following:
+ * (1/1)  u16 error_code;
+ * (2/1)  u16 entry_vector;
+ *   (2)  u32 eip;
+ *   (3)  u32 cs;
+ *   (4)  u32 eflags;
+ */
+#define VMX_MONITOR_EFLAGS     0x202 /* IF on */
+#define NR_SKIPPED_REGS        4       /* See the above explanation */
+#define VMX_SAVE_ALL_NOSEGREGS \
+        pushl $VMX_MONITOR_EFLAGS; \
+        popf; \
+        subl $(NR_SKIPPED_REGS*4), %esp; \
+        pushl %eax; \
+        pushl %ebp; \
+        pushl %edi; \
+        pushl %esi; \
+        pushl %edx; \
+        pushl %ecx; \
+        pushl %ebx;
+
+ENTRY(vmx_asm_vmexit_handler)
+        /* selectors are restored/saved by VMX */
+        VMX_SAVE_ALL_NOSEGREGS
+        call SYMBOL_NAME(vmx_vmexit_handler)
+        jmp vmx_asm_do_resume
+
+ENTRY(vmx_asm_do_launch)
+        popl %ebx
+        popl %ecx
+        popl %edx
+        popl %esi
+        popl %edi
+        popl %ebp
+        popl %eax
+        addl $(NR_SKIPPED_REGS*4), %esp
+        /* VMLAUNCH */
+        .byte 0x0f,0x01,0xc2
+        pushf
+        call SYMBOL_NAME(vm_launch_fail)
+        hlt
+        
+        ALIGN
+        
+ENTRY(vmx_asm_do_resume)
+vmx_test_all_events:
+        GET_CURRENT(%ebx)
+/* test_all_events: */
+        xorl %ecx,%ecx
+        notl %ecx
+        cli                             # tests must not race interrupts
+/*test_softirqs:*/  
+        movl EDOMAIN_processor(%ebx),%eax
+        shl  $6,%eax                    # sizeof(irq_cpustat) == 64
+        test %ecx,SYMBOL_NAME(irq_stat)(%eax,1)
+        jnz  vmx_process_softirqs
+
+vmx_restore_all_guest:
+        call SYMBOL_NAME(load_cr2)
+        /* 
+         * Check if we are going back to VMX-based VM
+         * By this time, all the setups in the VMCS must be complete.
+         */
+        popl %ebx
+        popl %ecx
+        popl %edx
+        popl %esi
+        popl %edi
+        popl %ebp
+        popl %eax
+        addl $(NR_SKIPPED_REGS*4), %esp
+        /* VMRESUME */
+        .byte 0x0f,0x01,0xc3
+        pushf
+        call SYMBOL_NAME(vm_resume_fail)
+        /* Should never reach here */
+        hlt
+
+        ALIGN
+vmx_process_softirqs:
+        sti       
+        call SYMBOL_NAME(do_softirq)
+        jmp  vmx_test_all_events
+#endif
+        
+ENTRY(continue_nonidle_task)
+        GET_CURRENT(%ebx)
+        jmp test_all_events
+
         ALIGN
 restore_all_guest:
         testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
index d7b22caf6330caece0017cf921992818d84424f9..017a27dc17a27c98e9fe60ddaabe23580baa04aa 100644 (file)
@@ -474,7 +474,7 @@ static long evtchn_close(evtchn_close_t *close)
 }
 
 
-static long evtchn_send(int lport)
+long evtchn_send(int lport)
 {
     struct domain *ld = current->domain;
     struct exec_domain *rd;
index 3d43152d981fdea11d0bbe4197fa73c57f67f23c..90a202014d0a114b2c91c6e1311ee825bdb78111 100644 (file)
@@ -33,13 +33,13 @@ xmem_cache_t *exec_domain_struct_cachep;
 struct domain *dom0;
 
 vm_assist_info_t vm_assist_info[MAX_VMASST_TYPE + 1];
-
+#if 0
 struct e820entry {
     unsigned long addr_lo, addr_hi;        /* start of memory segment */
     unsigned long size_lo, size_hi;        /* size of memory segment */
     unsigned long type;                    /* type of memory segment */
 };
-
+#endif
 void start_of_day(void);
 
 /* opt_console: comma-separated list of console outputs. */
index 2a59925a074f3b8c8453697bf30221ed9b4bd2d9..ab2f243e15ef91632c0a6684ecbeebf6aca714b8 100644 (file)
@@ -13,6 +13,7 @@
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <asm/vmx_vmcs.h>
 #include <xen/softirq.h>
 
 irq_cpustat_t irq_stat[NR_CPUS];
index 99f47071f182fac4e52e82195ed2176e2a819f9a..1ac2eb358a15e0cc827fd8ba8b20103c2c9783d1 100644 (file)
@@ -7,6 +7,7 @@
 #ifndef __XEN_I386_CONFIG_H__
 #define __XEN_I386_CONFIG_H__
 
+#define CONFIG_VMX 1
 #define CONFIG_X86 1
 
 #define CONFIG_SMP 1
index 8b2e913bff57b321b19da543ef11ae6b2de1ec4a..6b02cb878ac6da2d4f31d8ac8616a49781fbe7a8 100644 (file)
@@ -71,6 +71,8 @@
 #define X86_FEATURE_P4         (3*32+ 7) /* P4 */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_MWAIT      (4*32+ 3) /* Monitor/Mwait support */
+#define X86_FEATURE_VMXE       (4*32+ 5) /* Virtual Machine Extensions */
 #define X86_FEATURE_EST                (4*32+ 7) /* Enhanced SpeedStep */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
diff --git a/xen/include/asm-x86/e820.h b/xen/include/asm-x86/e820.h
new file mode 100644 (file)
index 0000000..080065e
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * structures and definitions for the int 15, ax=e820 memory map
+ * scheme.
+ *
+ * In a nutshell, arch/i386/boot/setup.S populates a scratch table
+ * in the empty_zero_block that contains a list of usable address/size
+ * duples.   In arch/i386/kernel/setup.c, this information is
+ * transferred into the e820map, and in arch/i386/mm/init.c, that
+ * new information is used to mark pages reserved or not.
+ *
+ */
+#ifndef __E820_HEADER
+#define __E820_HEADER
+
+#define E820MAP        0x2d0           /* our map */
+#define E820MAX        32              /* number of entries in E820MAP */
+#define E820NR 0x1e8           /* # entries in E820MAP */
+
+#define E820_RAM       1
+#define E820_RESERVED  2
+#define E820_ACPI      3 /* usable as RAM once ACPI tables have been read */
+#define E820_NVS       4
+#define E820_IO                        16
+#define E820_SHARED_PAGE       17
+
+#define HIGH_MEMORY    (1024*1024)
+
+#ifndef __ASSEMBLY__
+
+struct e820map {
+    int nr_map;
+    struct e820entry {
+        unsigned long long addr;       /* start of memory segment */
+        unsigned long long size;       /* size of memory segment */
+        unsigned long type;            /* type of memory segment */
+    } map[E820MAX];
+};
+
+extern struct e820map e820;
+#endif/*!__ASSEMBLY__*/
+
+#endif/*__E820_HEADER*/
index 87ffe1ecc15c2cc1c531adc26831f24339dda6a5..e392d588cadb43af966e9c2575615966f07a45d4 100644 (file)
@@ -215,10 +215,19 @@ void synchronise_pagetables(unsigned long cpu_mask);
  * contiguous (or near contiguous) physical memory.
  */
 #undef  machine_to_phys_mapping
+/*
+ * The phys_to_machine_mapping is the reversed mapping of MPT for full
+ * virtualization.
+ */
+#undef  phys_to_machine_mapping
+
 #ifdef __x86_64__
 extern unsigned long *machine_to_phys_mapping;
 #else
 #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START)
+#ifdef CONFIG_VMX
+#define phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START)
+#endif
 #endif
 
 #define DEFAULT_GDT_ENTRIES     (LAST_RESERVED_GDT_ENTRY+1)
index a412963fd9920314391fe2db26c40764aebf0172..793860de54e0041e59fc6613bac2b99c4eac83b7 100644 (file)
 #define MSR_MTRRcap            0x0fe
 #define MSR_IA32_BBL_CR_CTL        0x119
 
+#define MSR_IA32_SYSENTER_CS           0x174
+#define MSR_IA32_SYSENTER_ESP          0x175
+#define MSR_IA32_SYSENTER_EIP          0x176
+
 #define MSR_IA32_MCG_CAP       0x179
 #define MSR_IA32_MCG_STATUS        0x17a
 #define MSR_IA32_MCG_CTL       0x17b
index a23c4a28099205dd16b4d51bfe53d8390cfb89d9..9935c9b2b6bea537e58654340bed4d3f5e27f242 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/pdb.h>
 #include <xen/config.h>
 #include <xen/spinlock.h>
+#include <asm/vmx_vmcs.h>
 #include <public/xen.h>
 #endif
 
@@ -84,6 +85,7 @@
 #define X86_CR4_PCE            0x0100  /* enable performance counters at ipl 3 */
 #define X86_CR4_OSFXSR         0x0200  /* enable fast FPU save and restore */
 #define X86_CR4_OSXMMEXCPT     0x0400  /* enable unmasked SSE exceptions */
+#define X86_CR4_VMXE           0x2000  /* enable VMX */
 
 /*
  * Trap/fault mnemonics.
@@ -429,6 +431,9 @@ struct thread_struct {
     struct desc_struct fast_trap_desc;
 #endif
     trap_info_t        traps[256];
+#ifdef CONFIG_VMX
+    struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
+#endif
 };
 
 #define IDT_ENTRIES 256
@@ -473,6 +478,18 @@ struct mm_struct {
     l1_pgentry_t *perdomain_ptes;
     pagetable_t  pagetable;
 
+#ifdef CONFIG_VMX
+
+#define SHM_full_32     (8) /* full virtualization for 32-bit */
+
+        pagetable_t  monitor_table;
+        l2_pgentry_t *vpagetable;      /* virtual address of pagetable */
+        l2_pgentry_t *shadow_vtable;   /* virtual address of shadow_table */
+        l2_pgentry_t *guest_pl2e_cache;        /* guest page directory cache */
+        unsigned long min_pfn;         /* min host physical */
+        unsigned long max_pfn;         /* max host physical */
+#endif
+
     /* shadow mode status and controls */
     unsigned int shadow_mode;  /* flags to control shadow table operation */
     pagetable_t  shadow_table;
@@ -502,14 +519,25 @@ struct mm_struct {
     char gdt[10]; /* NB. 10 bytes needed for x86_64. Use 6 bytes for x86_32. */
 };
 
+#define SHM_full_32     (8) /* full virtualization for 32-bit */
+
 static inline void write_ptbase(struct mm_struct *mm)
 {
     unsigned long pa;
 
+#ifdef CONFIG_VMX
+    if ( unlikely(mm->shadow_mode) ) {
+            if (mm->shadow_mode == SHM_full_32)
+                    pa = pagetable_val(mm->monitor_table);
+            else
+                    pa = pagetable_val(mm->shadow_table);   
+    }
+#else
     if ( unlikely(mm->shadow_mode) )
-        pa = pagetable_val(mm->shadow_table);
+            pa = pagetable_val(mm->shadow_table);    
+#endif
     else
-        pa = pagetable_val(mm->pagetable);
+            pa = pagetable_val(mm->pagetable);
 
     write_cr3(pa);
 }
@@ -533,18 +561,40 @@ long set_gdt(struct exec_domain *d,
 
 long set_debugreg(struct exec_domain *p, int reg, unsigned long value);
 
+struct microcode_header {
+        unsigned int hdrver;
+        unsigned int rev;
+        unsigned int date;
+        unsigned int sig;
+        unsigned int cksum;
+        unsigned int ldrver;
+        unsigned int pf;
+        unsigned int datasize;
+        unsigned int totalsize;
+        unsigned int reserved[3];
+};
+
 struct microcode {
-    unsigned int hdrver;
-    unsigned int rev;
-    unsigned int date;
-    unsigned int sig;
-    unsigned int cksum;
-    unsigned int ldrver;
-    unsigned int pf;
-    unsigned int reserved[5];
-    unsigned int bits[500];
+        struct microcode_header hdr;
+        unsigned int bits[0];
 };
 
+typedef struct microcode microcode_t;
+typedef struct microcode_header microcode_header_t;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+        unsigned int sig;
+        unsigned int pf;
+        unsigned int cksum;
+};
+
+struct extended_sigtable {
+        unsigned int count;
+        unsigned int cksum;
+        unsigned int reserved[3];
+        struct extended_signature sigs[0];
+};
 /* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */
 #define MICROCODE_IOCFREE      _IO('6',0)
 
index d94acbe707bdbe76be3d0b7992f16762271e2d11..29c9dbb5ca072067740462fb0fb68f8718724c6c 100644 (file)
@@ -17,6 +17,7 @@
 #define SHM_logdirty    (2) /* log pages that are dirtied */
 #define SHM_translate   (3) /* lookup machine pages in translation table */
 #define SHM_cow         (4) /* copy on write all dirtied pages */
+#define SHM_full_32     (8) /* full virtualization for 32-bit */
 
 #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
 #define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
@@ -37,6 +38,23 @@ extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte);
 extern void unshadow_table(unsigned long gpfn, unsigned int type);
 extern int shadow_mode_enable(struct domain *p, unsigned int mode);
 
+#ifdef CONFIG_VMX
+extern void vmx_shadow_clear_state(struct mm_struct *);
+extern void vmx_shadow_invlpg(struct mm_struct *, unsigned long);
+#endif
+
+#define  __get_machine_to_phys(m, guest_gpfn, gpfn)     \
+    if ((m)->shadow_mode == SHM_full_32)                \
+        (guest_gpfn) = machine_to_phys_mapping[(gpfn)]; \
+    else                                                \
+        (guest_gpfn) = (gpfn);
+
+#define  __get_phys_to_machine(m, host_gpfn, gpfn)     \
+    if ((m)->shadow_mode == SHM_full_32)               \
+        (host_gpfn) = phys_to_machine_mapping[(gpfn)]; \
+    else                                               \
+        (host_gpfn) = (gpfn);
+
 extern void __shadow_mode_disable(struct domain *d);
 static inline void shadow_mode_disable(struct domain *d)
 {
@@ -46,8 +64,14 @@ static inline void shadow_mode_disable(struct domain *d)
 
 extern unsigned long shadow_l2_table( 
     struct mm_struct *m, unsigned long gpfn);
+  
+static inline void shadow_invalidate(struct mm_struct *m) {
+    if (m->shadow_mode != SHM_full_32)
+        BUG();
+    memset(m->shadow_vtable, 0, PAGE_SIZE);
+}
 
-#define SHADOW_DEBUG      0
+#define SHADOW_DEBUG 0
 #define SHADOW_HASH_DEBUG 0
 
 struct shadow_status {
@@ -80,9 +104,55 @@ printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
     printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
            current->id , __LINE__ , ## _a )
 #else
-#define SH_VVLOG(_f, _a...) 
+#define SH_VVLOG(_f, _a...)
 #endif
 
+static inline void __shadow_get_pl2e(struct mm_struct *m, 
+                                unsigned long va, unsigned long *sl2e)
+{
+    if (m->shadow_mode == SHM_full_32) {
+        *sl2e = l2_pgentry_val(m->shadow_vtable[va >> L2_PAGETABLE_SHIFT]);
+    }
+    else
+        *sl2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+}
+
+static inline void __shadow_set_pl2e(struct mm_struct *m, 
+                                unsigned long va, unsigned long value)
+{
+    if (m->shadow_mode == SHM_full_32) {
+        m->shadow_vtable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+    }
+    else
+        linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+}
+
+static inline void __guest_get_pl2e(struct mm_struct *m, 
+                                unsigned long va, unsigned long *l2e)
+{
+    if (m->shadow_mode == SHM_full_32) {
+        *l2e = l2_pgentry_val(m->vpagetable[va >> L2_PAGETABLE_SHIFT]);
+    }
+    else
+        *l2e = l2_pgentry_val(linear_l2_table[va >> L2_PAGETABLE_SHIFT]);
+}
+
+static inline void __guest_set_pl2e(struct mm_struct *m, 
+                                unsigned long va, unsigned long value)
+{
+    if (m->shadow_mode == SHM_full_32) {
+        unsigned long pfn;
+
+        pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
+                m->guest_pl2e_cache[va >> L2_PAGETABLE_SHIFT] =
+                        mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+
+        m->vpagetable[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+    }
+    else
+        linear_l2_table[va >> L2_PAGETABLE_SHIFT] = mk_l2_pgentry(value);
+
+}
 
 /************************************************************************/
 
@@ -151,7 +221,6 @@ static inline void l1pte_write_fault(
     unsigned long spte = *spte_p;
 
     ASSERT(gpte & _PAGE_RW);
-
     gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
 
     switch ( m->shadow_mode )
@@ -163,9 +232,19 @@ static inline void l1pte_write_fault(
     case SHM_logdirty:
         spte = gpte | _PAGE_RW;
         __mark_dirty(m, gpte >> PAGE_SHIFT);
+        break;
+    case SHM_full_32:
+    {
+        unsigned long host_pfn, host_gpte;
+        
+        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+        host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+        spte = host_gpte | _PAGE_RW;
+    }
         break;
     }
 
+    SH_VVLOG("updating spte=%lx gpte=%lx", spte, gpte);
     *gpte_p = gpte;
     *spte_p = spte;
 }
@@ -187,6 +266,17 @@ static inline void l1pte_read_fault(
     case SHM_logdirty:
         spte = gpte & ~_PAGE_RW;
         break;
+
+    case SHM_full_32:
+    {
+        unsigned long host_pfn, host_gpte;
+        
+        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+        host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+        spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW);
+    }
+        break;
+
     }
 
     *gpte_p = gpte;
@@ -214,6 +304,20 @@ static inline void l1pte_propagate_from_guest(
              (_PAGE_PRESENT|_PAGE_ACCESSED) )
             spte = gpte & ~_PAGE_RW;
         break;
+
+    case SHM_full_32:
+    {
+        unsigned long host_pfn, host_gpte;
+        
+        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
+        host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
+        spte = 0;
+
+        if ( (host_gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
+             (_PAGE_PRESENT|_PAGE_ACCESSED) )
+            spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW);
+    }
+        break;
     }
 
     *gpte_p = gpte;
@@ -239,8 +343,12 @@ static inline void l2pde_general(
 
         /* Detect linear p.t. mappings and write-protect them. */
         if ( (frame_table[sl1pfn].u.inuse.type_info & PGT_type_mask) ==
-             PGT_l2_page_table )
-            spde = gpde & ~_PAGE_RW;
+             PGT_l2_page_table ) 
+        {
+            if (m->shadow_mode != SHM_full_32)
+                spde = gpde & ~_PAGE_RW;
+
+        }
     }
 
     *gpde_p = gpde;
@@ -394,7 +502,7 @@ static inline void delete_shadow_status(
 
     head = hash_bucket(m, gpfn);
 
-    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b);
+    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, head);
     shadow_audit(m, 0);
 
     /* Match on head item? */
@@ -469,7 +577,7 @@ static inline void set_shadow_status(
 
     x = head = hash_bucket(m, gpfn);
    
-    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next);
+    SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, x, x->next);
     shadow_audit(m, 0);
 
     /*
@@ -543,7 +651,72 @@ static inline void set_shadow_status(
  done:
     shadow_audit(m, 0);
 }
+  
+#ifdef CONFIG_VMX
+#include <asm/domain_page.h>
+
+static inline void vmx_update_shadow_state(
+    struct mm_struct *mm, unsigned long gpfn, unsigned long spfn)
+{
+
+    l2_pgentry_t *mpl2e = 0;
+    l2_pgentry_t *gpl2e, *spl2e;
+
+    /* unmap the old mappings */
+    if (mm->shadow_vtable)
+        unmap_domain_mem(mm->shadow_vtable);
+    if (mm->vpagetable)
+        unmap_domain_mem(mm->vpagetable);
+
+    /* new mapping */
+    mpl2e = (l2_pgentry_t *) 
+        map_domain_mem(pagetable_val(mm->monitor_table));
+
+    mpl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    __flush_tlb_one(SH_LINEAR_PT_VIRT_START);
+
+    spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
+    gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+    memset(spl2e, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
 
+    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+    mm->shadow_vtable = spl2e;
+    mm->vpagetable = gpl2e; /* expect the guest did clean this up */
+    unmap_domain_mem(mpl2e);
+}
+
+static inline void __shadow_mk_pagetable( struct mm_struct *mm )
+{
+    unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
+    unsigned long spfn;
+    SH_VLOG("0: __shadow_mk_pagetable(gpfn=%08lx\n", gpfn);
+
+    if (mm->shadow_mode == SHM_full_32) 
+    {
+        unsigned long guest_gpfn;
+        guest_gpfn = machine_to_phys_mapping[gpfn];
+
+        SH_VVLOG("__shadow_mk_pagetable(guest_gpfn=%08lx, gpfn=%08lx\n", 
+                 guest_gpfn, gpfn);
+
+        spfn = __shadow_status(mm, gpfn) & PSH_pfn_mask;
+        if ( unlikely(spfn == 0) ) {
+            spfn = shadow_l2_table(mm, gpfn);
+            mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+        } else {
+            vmx_update_shadow_state(mm, gpfn, spfn);
+        }
+    } else {
+        spfn = __shadow_status(mm, gpfn) & PSH_pfn_mask;
+
+        if ( unlikely(spfn == 0) ) {
+            spfn = shadow_l2_table(mm, gpfn);
+        }
+        mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+    }
+}
+#else
 static inline void __shadow_mk_pagetable(struct mm_struct *mm)
 {
     unsigned long gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
@@ -554,22 +727,26 @@ static inline void __shadow_mk_pagetable(struct mm_struct *mm)
 
     mm->shadow_table = mk_pagetable(spfn << PAGE_SHIFT);
 }
+#endif /* CONFIG_VMX */
 
 static inline void shadow_mk_pagetable(struct mm_struct *mm)
 {
-    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
-             pagetable_val(mm->pagetable), mm->shadow_mode );
-
-    if ( unlikely(mm->shadow_mode) )
-    {
-        shadow_lock(mm);
-        __shadow_mk_pagetable(mm);
-        shadow_unlock(mm);
-    }
-
-    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
-             pagetable_val(mm->pagetable), mm->shadow_mode, 
-             pagetable_val(mm->shadow_table) );
+     if ( unlikely(mm->shadow_mode) )
+     {
+         SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
+             pagetable_val(mm->pagetable), mm->shadow_mode ); 
+
+         shadow_lock(mm);
+         __shadow_mk_pagetable(mm);
+         shadow_unlock(mm);
+
+     SH_VVLOG("leaving shadow_mk_pagetable:\n");
+     SH_VVLOG("( gptbase=%08lx, mode=%d ) sh=%08lx",
+              pagetable_val(mm->pagetable), mm->shadow_mode, 
+              pagetable_val(mm->shadow_table) );
+     } 
 }
 
 #if SHADOW_DEBUG
diff --git a/xen/include/asm-x86/vmx.h b/xen/include/asm-x86/vmx.h
new file mode 100644 (file)
index 0000000..b59f8d3
--- /dev/null
@@ -0,0 +1,251 @@
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_H__
+#define __ASM_X86_VMX_H__
+
+#include <xen/sched.h>
+#include <asm/types.h>
+#include <asm/regs.h>
+#include <asm/processor.h>
+#include <asm/vmx_vmcs.h>
+
+extern void vmx_asm_vmexit_handler(struct xen_regs);
+extern void vmx_asm_do_resume(void);
+extern void vmx_asm_do_launch(void);
+extern void vmx_intr_assist(struct exec_domain *d);
+
+extern void arch_vmx_do_launch(struct exec_domain *);
+extern void arch_vmx_do_resume(struct exec_domain *);
+
+extern int vmcs_size;
+extern unsigned int cpu_rev;
+
+/*
+ * Need fill bits for SENTER
+ */
+
+#define MONITOR_PIN_BASED_EXEC_CONTROLS         0x0000001f      
+#define MONITOR_CPU_BASED_EXEC_CONTROLS         0x0581e7f2
+#define MONITOR_VM_EXIT_CONTROLS                0x0003edff
+#define MONITOR_VM_ENTRY_CONTROLS               0x000011ff
+
+/*
+ * Exit Reasons
+ */
+#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI       0
+#define EXIT_REASON_EXTERNAL_INTERRUPT  1
+
+#define EXIT_REASON_PENDING_INTERRUPT   7
+
+#define EXIT_REASON_TASK_SWITCH         9
+#define EXIT_REASON_CPUID               10
+#define EXIT_REASON_HLT                 12
+#define EXIT_REASON_INVLPG              14
+#define EXIT_REASON_RDPMC               15
+#define EXIT_REASON_RDTSC               16
+#define EXIT_REASON_VMCALL              18
+
+#define EXIT_REASON_CR_ACCESS           28
+#define EXIT_REASON_DR_ACCESS           29
+#define EXIT_REASON_IO_INSTRUCTION      30
+#define EXIT_REASON_MSR_READ            31
+#define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_MWAIT_INSTRUCTION   36
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
+#define INTR_INFO_DELIEVER_CODE_MASK    0x800           /* 11 */
+#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
+
+#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
+#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control register */
+#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
+#define TYPE_MOV_TO_CR                  (0 << 4) 
+#define TYPE_MOV_FROM_CR                (1 << 4)
+#define TYPE_CLTS                       (2 << 4)
+#define CONTROL_REG_ACCESS_REG          0x700   /* 10:8, general purpose register */
+#define REG_EAX                         (0 << 8) 
+#define REG_ECX                         (1 << 8) 
+#define REG_EDX                         (2 << 8) 
+#define REG_EBX                         (3 << 8) 
+#define REG_ESP                         (4 << 8) 
+#define REG_EBP                         (5 << 8) 
+#define REG_ESI                         (6 << 8) 
+#define REG_EDI                         (7 << 8) 
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug register */
+#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
+#define TYPE_MOV_TO_DR                  (0 << 4) 
+#define TYPE_MOV_FROM_DR                (1 << 4)
+#define DEBUG_REG_ACCESS_REG            0x700   /* 11:8, general purpose register */
+#define EXCEPTION_BITMAP_DE     (1 << 0)        /* Divide Error */
+#define EXCEPTION_BITMAP_DB     (1 << 1)        /* Debug */
+#define EXCEPTION_BITMAP_NMI    (1 << 2)        /* NMI */
+#define EXCEPTION_BITMAP_BP     (1 << 3)        /* Breakpoint */
+#define EXCEPTION_BITMAP_OF     (1 << 4)        /* Overflow */
+#define EXCEPTION_BITMAP_BR     (1 << 5)        /* BOUND Range Exceeded */
+#define EXCEPTION_BITMAP_UD     (1 << 6)        /* Invalid Opcode */
+#define EXCEPTION_BITMAP_NM     (1 << 7)        /* Device Not Available */
+#define EXCEPTION_BITMAP_DF     (1 << 8)        /* Double Fault */
+/* reserved */
+#define EXCEPTION_BITMAP_TS     (1 << 10)       /* Invalid TSS */
+#define EXCEPTION_BITMAP_NP     (1 << 11)       /* Segment Not Present */
+#define EXCEPTION_BITMAP_SS     (1 << 12)       /* Stack-Segment Fault */
+#define EXCEPTION_BITMAP_GP     (1 << 13)       /* General Protection */
+#define EXCEPTION_BITMAP_PG     (1 << 14)       /* Page Fault */
+#define EXCEPTION_BITMAP_MF     (1 << 16)       /* x87 FPU Floating-Point Error (Math Fault)  */
+#define EXCEPTION_BITMAP_AC     (1 << 17)       /* Alignment Check */
+#define EXCEPTION_BITMAP_MC     (1 << 18)       /* Machine Check */
+#define EXCEPTION_BITMAP_XF     (1 << 19)       /* SIMD Floating-Point Exception */
+
+#ifdef XEN_DEBUGGER
+#define MONITOR_DEFAULT_EXCEPTION_BITMAP        \
+    ( EXCEPTION_BITMAP_PG |                     \
+      EXCEPTION_BITMAP_DB |                     \
+      EXCEPTION_BITMAP_BP |                     \
+      EXCEPTION_BITMAP_GP )
+#else
+#define MONITOR_DEFAULT_EXCEPTION_BITMAP        \
+    ( EXCEPTION_BITMAP_PG |                     \
+      EXCEPTION_BITMAP_GP )
+#endif
+
+#define VMCALL_OPCODE   ".byte 0x0f,0x01,0xc1\n"
+#define VMCLEAR_OPCODE  ".byte 0x66,0x0f,0xc7\n"        /* reg/opcode: /6 */
+#define VMLAUNCH_OPCODE ".byte 0x0f,0x01,0xc2\n"
+#define VMPTRLD_OPCODE  ".byte 0x0f,0xc7\n"             /* reg/opcode: /6 */
+#define VMPTRST_OPCODE  ".byte 0x0f,0xc7\n"             /* reg/opcode: /7 */
+#define VMREAD_OPCODE   ".byte 0x0f,0x78\n"
+#define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
+#define VMWRITE_OPCODE  ".byte 0x0f,0x79\n"
+#define VMXOFF_OPCODE   ".byte 0x0f,0x01,0xc4\n"
+#define VMXON_OPCODE    ".byte 0xf3,0x0f,0xc7\n"
+
+#define MODRM_EAX_06    ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
+#define MODRM_EAX_07    ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
+#define MODRM_EAX_ECX   ".byte 0xc1\n" /* [EAX], [ECX] */
+
+static inline int __vmptrld (u64 addr)
+{
+    unsigned long eflags;
+    __asm__ __volatile__ ( VMPTRLD_OPCODE
+                           MODRM_EAX_06
+                           :
+                           : "a" (&addr) 
+                           : "memory");
+
+    __save_flags(eflags);
+    if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+        return -1;
+    return 0;
+}
+
+static inline void __vmptrst (u64 addr)
+{
+    __asm__ __volatile__ ( VMPTRST_OPCODE
+                           MODRM_EAX_07
+                           :
+                           : "a" (&addr) 
+                           : "memory");
+}
+
+static inline int __vmpclear (u64 addr)
+{
+    unsigned long eflags;
+
+    __asm__ __volatile__ ( VMCLEAR_OPCODE
+                           MODRM_EAX_06
+                           :
+                           : "a" (&addr) 
+                           : "memory");
+    __save_flags(eflags);
+    if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+        return -1;
+    return 0;
+}
+
+static inline int __vmread (unsigned int field, void *value)
+{
+    unsigned long eflags;
+    unsigned long ecx = 0;
+
+    __asm__ __volatile__ ( VMREAD_OPCODE
+                           MODRM_EAX_ECX       
+                           : "=c" (ecx)
+                           : "a" (field)
+                           : "memory");
+
+    *((long *) value) = ecx;
+
+    __save_flags(eflags);
+    if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+        return -1;
+    return 0;
+}
+
+static inline int __vmwrite (unsigned int field, unsigned int value)
+{
+    unsigned long eflags;
+
+    __asm__ __volatile__ ( VMWRITE_OPCODE
+                           MODRM_EAX_ECX       
+                           :
+                           : "a" (field) , "c" (value)
+                           : "memory");
+    __save_flags(eflags);
+    if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+        return -1;
+    return 0;
+}
+
+static inline void __vmxoff (void)
+{
+    __asm__ __volatile__ ( VMXOFF_OPCODE 
+                           ::: "memory");
+}
+
+static inline int __vmxon (u64 addr)
+{
+    unsigned long eflags;
+
+    __asm__ __volatile__ ( VMXON_OPCODE
+                           MODRM_EAX_06
+                           :
+                           : "a" (&addr) 
+                           : "memory");
+    __save_flags(eflags);
+    if (eflags & X86_EFLAGS_ZF || eflags & X86_EFLAGS_CF)
+        return -1;
+    return 0;
+}
+#endif /* __ASM_X86_VMX_H__ */
diff --git a/xen/include/asm-x86/vmx_cpu.h b/xen/include/asm-x86/vmx_cpu.h
new file mode 100644 (file)
index 0000000..2cccc15
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * vmx_cpu.h: Virtual CPU state
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_CPU_H__
+#define __ASM_X86_VMX_CPU_H__
+
+/*
+ * Virtual CPU
+ */
+struct arch_state_struct {
+    unsigned long       mode_flags; /* vm86, 32-bit, 64-bit, etc. */
+    /* debug registers */
+    /* MSRs */
+};
+
+#define VMX_MF_VM86     0
+#define VMX_MF_32       1
+#define VMX_MF_64       2
+
+#endif
diff --git a/xen/include/asm-x86/vmx_platform.h b/xen/include/asm-x86/vmx_platform.h
new file mode 100644 (file)
index 0000000..f2b8a03
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ * vmx_platform.h: VMX platform support
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_PLATFORM_H__
+#define __ASM_X86_VMX_PLATFORM_H__
+
+#include <asm/e820.h>          /* from Linux */
+
+#endif
diff --git a/xen/include/asm-x86/vmx_vmcs.h b/xen/include/asm-x86/vmx_vmcs.h
new file mode 100644 (file)
index 0000000..8ec77d8
--- /dev/null
@@ -0,0 +1,225 @@
+/*
+ * vmx_vmcs.h: VMCS related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef __ASM_X86_VMX_VMCS_H__
+#define __ASM_X86_VMX_VMCS_H__
+
+#include <asm/config.h>
+#include <asm/vmx_cpu.h>
+#include <asm/vmx_platform.h>
+
+extern int start_vmx(void);
+extern void stop_vmx(void);
+
+void vmx_enter_scheduler(void);
+
+union vmcs_arbytes {
+    struct arbyte_fields {
+        unsigned int 
+        seg_type: 4, s: 1, dpl: 2, p: 1, 
+        reserved0: 4, avl: 1, reserved1: 1,     
+        default_ops_size: 1, g: 1, null_bit: 1, 
+        reserved2: 15;
+    }  __attribute__((packed)) fields;
+    unsigned int bytes;
+};
+
+struct virutal_platform_def {
+    unsigned long   *real_mode_data; /* E820, etc. */
+    unsigned long   shared_page_va;
+};
+
+int vmx_setup_platform(struct exec_domain *, execution_context_t *);
+
+#define VMX_CPU_STATE_PG_ENABLED        0       
+
+#define VMCS_SIZE                       0x1000
+
+struct vmcs_struct {
+    u32 vmcs_revision_id;
+    unsigned char data [0x1000 - sizeof (u32)];
+};
+
+struct arch_vmx_struct {
+    struct vmcs_struct      *vmcs;  /* VMCS pointer in virtual */
+    unsigned long           flags;  /* VMCS flags */
+    unsigned long           cpu_cr2; /* save CR2 */
+    unsigned long           cpu_cr3;
+    unsigned long           cpu_state;
+    struct virutal_platform_def     vmx_platform; 
+#if 0
+    /* open */
+    unsigned long *page_list; /* page list for MMIO */
+#endif
+};
+
+#define vmx_schedule_tail(next)         \
+    (next)->thread.arch_vmx.arch_vmx_schedule_tail((next))
+
+#define VMX_DOMAIN(d)   d->thread.arch_vmx.flags
+
+#define ARCH_VMX_VMCS_LOADED    0       /* VMCS has been loaded and active */
+#define ARCH_VMX_VMCS_LAUNCH    1       /* Needs VMCS launch */
+#define ARCH_VMX_VMCS_RESUME    2       /* Needs VMCS resume */
+#define ARCH_VMX_IO_WAIT        3       /* Waiting for I/O completion */
+
+void vmx_do_launch(struct exec_domain *); 
+void vmx_do_resume(struct exec_domain *); 
+
+struct vmcs_struct *alloc_vmcs(void);
+void free_vmcs(struct vmcs_struct *);
+int  load_vmcs(struct arch_vmx_struct *, u64);
+int  store_vmcs(struct arch_vmx_struct *, u64);
+void dump_vmcs(void);
+int  construct_vmcs(struct arch_vmx_struct *, execution_context_t *, 
+                    full_execution_context_t *, int);
+
+#define VMCS_USE_HOST_ENV       1
+#define VMCS_USE_SEPARATE_ENV   0
+
+#define VMCS_EFLAGS_RESERVED_0          0xffc08028 /* bitmap for 0 */
+#define VMCS_EFLAGS_RESERVED_1          0x00000002 /* bitmap for 1 */
+
+extern int vmcs_version;
+
+/* VMCS Encordings */
+enum vmcs_field {
+    GUEST_ES_SELECTOR               = 0x00000800,
+    GUEST_CS_SELECTOR               = 0x00000802,
+    GUEST_SS_SELECTOR               = 0x00000804,
+    GUEST_DS_SELECTOR               = 0x00000806,
+    GUEST_FS_SELECTOR               = 0x00000808,
+    GUEST_GS_SELECTOR               = 0x0000080a,
+    GUEST_LDTR_SELECTOR             = 0x0000080c,
+    GUEST_TR_SELECTOR               = 0x0000080e,
+    HOST_ES_SELECTOR                = 0x00000c00,
+    HOST_CS_SELECTOR                = 0x00000c02,
+    HOST_SS_SELECTOR                = 0x00000c04,
+    HOST_DS_SELECTOR                = 0x00000c06,
+    HOST_FS_SELECTOR                = 0x00000c08,
+    HOST_GS_SELECTOR                = 0x00000c0a,
+    HOST_TR_SELECTOR                = 0x00000c0c,
+    IO_BITMAP_A                     = 0x00002000, 
+    IO_BITMAP_B                     = 0x00002002, 
+    VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
+    VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
+    VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
+    TSC_OFFSET                      = 0x00002010,
+    GUEST_VMCS0                     = 0x00002800,
+    GUEST_VMCS1                     = 0x00002801,
+    GUEST_IA32_DEBUGCTL             = 0x00002802,
+    PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
+    CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,   
+    EXCEPTION_BITMAP                = 0x00004004,
+    PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
+    PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
+    CR3_TARGET_COUNT                = 0x0000400a,
+    VM_EXIT_CONTROLS                = 0x0000400c,
+    VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
+    VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
+    VM_ENTRY_CONTROLS               = 0x00004012,
+    VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
+    VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
+    VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
+    VM_EXIT_REASON                  = 0x00004402,
+    VM_EXIT_INTR_INFO               = 0x00004404,   
+    VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
+    IDT_VECTORING_INFO_FIELD        = 0x00004408,
+    IDT_VECTORING_ERROR_CODE        = 0x0000440a,
+    INSTRUCTION_LEN                 = 0x0000440c,
+    GUEST_ES_LIMIT                  = 0x00004800,
+    GUEST_CS_LIMIT                  = 0x00004802,
+    GUEST_SS_LIMIT                  = 0x00004804,
+    GUEST_DS_LIMIT                  = 0x00004806,
+    GUEST_FS_LIMIT                  = 0x00004808,
+    GUEST_GS_LIMIT                  = 0x0000480a,
+    GUEST_LDTR_LIMIT                = 0x0000480c,
+    GUEST_TR_LIMIT                  = 0x0000480e,
+    GUEST_GDTR_LIMIT                = 0x00004810,
+    GUEST_IDTR_LIMIT                = 0x00004812,
+    GUEST_ES_AR_BYTES               = 0x00004814,
+    GUEST_CS_AR_BYTES               = 0x00004816,
+    GUEST_SS_AR_BYTES               = 0x00004818,
+    GUEST_DS_AR_BYTES               = 0x0000481a,
+    GUEST_FS_AR_BYTES               = 0x0000481c,
+    GUEST_GS_AR_BYTES               = 0x0000481e,
+    GUEST_LDTR_AR_BYTES             = 0x00004820,
+    GUEST_TR_AR_BYTES               = 0x00004822,
+    GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
+    CR0_GUEST_HOST_MASK             = 0x00006000,
+    CR4_GUEST_HOST_MASK             = 0x00006002,
+    CR0_READ_SHADOW                 = 0x00006004,
+    CR4_READ_SHADOW                 = 0x00006006,
+    CR3_TARGET_VALUES               = 0x00006008, 
+    CR3_GUEST_HOST_MASK             = 0x00006208,
+    EXIT_QUALIFICATION              = 0x00006400,
+    GUEST_CR0                       = 0x00006800,
+    GUEST_CR3                       = 0x00006802,
+    GUEST_CR4                       = 0x00006804,
+    GUEST_ES_BASE                   = 0x00006806,
+    GUEST_CS_BASE                   = 0x00006808,
+    GUEST_SS_BASE                   = 0x0000680a,
+    GUEST_DS_BASE                   = 0x0000680c,
+    GUEST_FS_BASE                   = 0x0000680e,
+    GUEST_GS_BASE                   = 0x00006810,
+    GUEST_LDTR_BASE                 = 0x00006812,
+    GUEST_TR_BASE                   = 0x00006814,
+    GUEST_GDTR_BASE                 = 0x00006816,    
+    GUEST_IDTR_BASE                 = 0x00006818,
+    GUEST_DR7                       = 0x0000681a,
+    GUEST_ESP                       = 0x0000681c,
+    GUEST_EIP                       = 0x0000681e,
+    GUEST_EFLAGS                    = 0x00006820,
+    GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
+    HOST_CR0                        = 0x00006c00,
+    HOST_CR3                        = 0x00006c02,
+    HOST_CR4                        = 0x00006c04,
+    HOST_FS_BASE                    = 0x00006c06,
+    HOST_GS_BASE                    = 0x00006c08,
+    HOST_TR_BASE                    = 0x00006c0a,
+    HOST_GDTR_BASE                  = 0x00006c0c,
+    HOST_IDTR_BASE                  = 0x00006c0e,
+    HOST_ESP                        = 0x00006c14,
+    HOST_EIP                        = 0x00006c16,
+};
+
+#define VMX_DEBUG 1
+#if VMX_DEBUG
+#define DBG_LEVEL_0     (1 << 0)
+#define DBG_LEVEL_1     (1 << 1)
+#define DBG_LEVEL_2     (1 << 2)
+#define DBG_LEVEL_3     (1 << 3)
+#define DBG_LEVEL_IO    (1 << 4)
+#define DBG_LEVEL_VMMU  (1 << 5)
+
+extern unsigned int opt_vmx_debug_level;
+#define VMX_DBG_LOG(level, _f, _a...)           \
+    if ((level) & opt_vmx_debug_level)          \
+        printk("[VMX]" _f "\n", ## _a )
+#else
+#define VMX_DBG_LOG(level, _f, _a...)
+#endif
+
+#define  __vmx_bug(regs)                                        \
+    do {                                                        \
+        printk("__vmx_bug at %s:%d\n", __FILE__, __LINE__);     \
+        show_registers(regs);                                   \
+        domain_crash();                                         \
+    } while (0)
+
+#endif /* __ASM_X86_VMX_VMCS_H__ */
index b7210fc1b57c12d5154148549953234eb3c035ef..a66bf2b55d5cb6484181be6f29d668741a8194cf 100644 (file)
@@ -114,6 +114,7 @@ typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
  */
 typedef struct {
 #define ECF_I387_VALID (1<<0)
+#define ECF_VMX_GUEST  (1<<1)
     unsigned long flags;
     execution_context_t cpu_ctxt;           /* User-level CPU registers     */
     char          fpu_ctxt[256];            /* User-level FPU registers     */
diff --git a/xen/include/public/io/ioreq.h b/xen/include/public/io/ioreq.h
new file mode 100644 (file)
index 0000000..c10dc46
--- /dev/null
@@ -0,0 +1,59 @@
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+
+#define STATE_INVALID           0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+
+#define IOPACKET_PORT   2
+
+/* VMExit dispatcher should cooperate with instruction decoder to
+   prepare this structure and notify service OS and DM by sending
+   virq */
+typedef struct {
+    u64     addr;               /*  physical address            */
+    u64     size;               /*  size in bytes               */
+    u64     count;             /*  for rep prefixes            */
+    union {
+        u64     data;           /*  data                        */
+        void    *pdata;         /*  pointer to data             */
+    } u;
+    u8      state:5;
+    u8      pdata_valid:1;     /* if 1, use pdata above        */
+    u8      dir:1;             /*  1=read, 0=write             */
+    u8      port_mm:1;         /*  0=portio, 1=mmio            */
+} ioreq_t;
+
+#define MAX_VECTOR    256
+#define BITS_PER_BYTE   8
+#define INTR_LEN        (MAX_VECTOR/(BITS_PER_BYTE * sizeof(unsigned long)))
+
+typedef struct {
+    ioreq_t         vp_ioreq;
+    unsigned long   vp_intr[INTR_LEN];
+} vcpu_iodata_t;
+
+#endif /* _IOREQ_H_ */
index ccda04ebe2e8a3eb2684e27be8e9125abe656836..a138c64dda5c4f5ce8770d43b5f8aa6b5f2ae277 100644 (file)
@@ -26,8 +26,6 @@
 extern unsigned long volatile jiffies;
 extern rwlock_t domlist_lock;
 
-struct domain;
-
 /* A global pointer to the initial domain (DOM0). */
 extern struct domain *dom0;
 
index 0299f741369ee9725a1db945e0e7161476f7fb3e..cd55353dfe28530497549131a99e7419a47c6aa2 100644 (file)
@@ -44,5 +44,7 @@ typedef         __u32           uint32_t;
 typedef         __u64           uint64_t;
 
 
+struct domain;
+struct exec_domain;
 
 #endif /* __TYPES_H__ */